# 04.04.24

# Import
import importlib.util
import logging
import os
from typing import Dict, Optional, Sequence


# Variable
# Maps a code-point section (codepoint >> 8) to its replacement table, or to
# None once a lookup has established that no table is available for it.
Cache: Dict[int, Optional[Sequence[Optional[str]]]] = {}


class UnidecodeError(ValueError):
    """Raised for transliteration errors.

    Attributes:
        index: Position in the input string of the character that caused the
            error, or None when the error is not tied to a character.
    """

    def __init__(self, message: str, index: Optional[int] = None) -> None:
        super().__init__(message)
        self.index = index


def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a string character by character, without an ASCII fast path.

    Args:
        string (str): The input string.
        errors (str): Policy for characters with no replacement: 'ignore'
            drops them, 'strict' raises UnidecodeError, 'replace' substitutes
            replace_str, 'preserve' keeps the original character.
        replace_str (str): The replacement string used when errors='replace'.

    Returns:
        str: The transliterated string. With errors='preserve' the result may
        still contain non-ASCII characters.

    Raises:
        UnidecodeError: If errors='strict' and a character has no replacement,
            or if errors is not one of the supported values and a character
            has no replacement.
    """
    return _transliterate(string, errors, replace_str)


def _load_table(section: int) -> Optional[Sequence[Optional[str]]]:
    """Load the replacement table for a section from the file next to this module.

    Args:
        section (int): The code point shifted right by 8 bits.

    Returns:
        The `data` object defined by `x<section as 3 hex digits>.py`, or None
        when that file cannot be found or imported.
    """
    module_stem = f"x{section:03x}"
    package_dir = os.path.abspath(os.path.dirname(__file__))
    module_path = os.path.join(package_dir, f"{module_stem}.py")

    try:
        spec = importlib.util.spec_from_file_location(module_stem, module_path)
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot build an import spec for {module_path}")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    except (ImportError, OSError):
        # Loading by file path reports a missing table as FileNotFoundError
        # (an OSError), not ImportError; both mean "no data for this section".
        return None

    return module.data


def _get_ascii_representation(char: str) -> Optional[str]:
    """Look up the ASCII replacement for a single character.

    Args:
        char (str): A single character.

    Returns:
        The replacement string, or None when no replacement is known.
    """
    codepoint = ord(char)

    # ASCII characters map to themselves
    if codepoint < 0x80:
        return str(char)

    # No data for code points above U+EFFFF (planes 15-16, private use)
    if codepoint > 0xeffff:
        return None

    # Surrogates are reported, then fall through to the normal table lookup
    if 0xd800 <= codepoint <= 0xdfff:
        logging.warning("Surrogate character %r will be ignored. "
                        "You might be using a narrow Python build.", char)

    section = codepoint >> 8    # Everything above the low byte
    position = codepoint % 256  # The low byte: index inside the table

    try:
        table = Cache[section]
    except KeyError:
        # First lookup in this section: load once and remember the outcome,
        # including the "no table" case, so the file system is not hit again.
        table = Cache[section] = _load_table(section)

    if table and len(table) > position:
        return table[position]
    return None


def _transliterate(string: str, errors: str, replace_str: str) -> str:
    """Transliterate every character of a string, applying the error policy.

    Args:
        string (str): The input string.
        errors (str): One of 'ignore', 'strict', 'replace' or 'preserve'.
        replace_str (str): The replacement string used when errors='replace'.

    Returns:
        str: The transliterated string.

    Raises:
        UnidecodeError: If errors='strict' and a character has no replacement
            (the exception's index is the character's position), or if errors
            has an unsupported value and a character has no replacement.
    """
    retval = []

    for index, char in enumerate(string):
        ascii_char = _get_ascii_representation(char)

        if ascii_char is None:
            # The policy is only consulted for characters without a replacement
            if errors == 'ignore':
                ascii_char = ''
            elif errors == 'strict':
                logging.error(f'No replacement found for character {char!r}')
                raise UnidecodeError(f'no replacement found for character {char!r}', index)
            elif errors == 'replace':
                ascii_char = replace_str
            elif errors == 'preserve':
                ascii_char = char
            else:
                logging.error(f'Invalid value for errors parameter {errors!r}')
                raise UnidecodeError(f'invalid value for errors parameter {errors!r}')

        retval.append(ascii_char)

    return ''.join(retval)


def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a string, returning it unchanged when it is already ASCII.

    The string is first encoded as ASCII; only if that fails is the
    character-by-character transliteration performed.

    Args:
        string (str): The input string.
        errors (str): Policy for characters with no replacement: 'ignore'
            drops them, 'strict' raises UnidecodeError, 'replace' substitutes
            replace_str, 'preserve' keeps the original character.
        replace_str (str): The replacement string used when errors='replace'.

    Returns:
        str: The input itself if it is pure ASCII, otherwise the
        transliterated string.

    Raises:
        UnidecodeError: See transliterate_nonascii.
    """
    try:
        string.encode('ASCII')
    except UnicodeEncodeError:
        # Contains non-ASCII characters: fall through to transliteration
        pass
    else:
        return string

    return _transliterate(string, errors, replace_str)


transliterate = transliterate_expect_ascii