Fix unidecode

2025-06-07 20:15:24 +00:00 · 2024-04-04 15:21:40 +02:00 · 2024-04-04 15:21:40 +02:00 · 3a99ca2afe
commit 3a99ca2afe
parent 9d1b7536b7
3 changed files with 154 additions and 149 deletions
--- a/Src/Api/series.py
+++ b/Src/Api/series.py
@ -5,7 +5,7 @@ from Src.Util.console import console, msg
 from Src.Util.config import config_manager
 from Src.Util.table import TVShowManager
 from Src.Util.message import start_message
-from Src.Lib.Unidecode import unidecode
+from Src.Lib.Unidecode import transliterate
 from Src.Lib.FFmpeg.my_m3u8 import Downloader
 from .Class import VideoSource
@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
    episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
    # Define filename and path for the downloaded video
-    mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
+    mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
    mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
    os.makedirs(mp4_path, exist_ok=True)
--- a/Src/Lib/Unidecode/init.py
+++ b/Src/Lib/Unidecode/init.py
@ -1,138 +1,143 @@
-# -*- coding: utf-8 -*-
+# 04.04.24
 # vi:tabstop=4:expandtab:sw=4
 """Transliterate Unicode text into plain 7-bit ASCII.
 Example usage:
->>> from unidecode import unidecode
+# Import
->>> unidecode("\u5317\u4EB0")
+import os
-"Bei Jing "
+import logging
 import importlib.util
-The transliteration uses a straightforward map, and doesn't have alternatives
+# Variable
-for the same character based on language, position, or anything else.
+Cache = {}
 A standard string object will be returned. If you need bytes, use:
 >>> unidecode("Κνωσός").encode("ascii")
 b'Knosos'
 """
 import warnings
 from typing import Dict, Optional, Sequence
 Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
 class UnidecodeError(ValueError):
-    def __init__(self, message: str, index: Optional[int] = None) -> None:
+    pass
        """Raised for Unidecode-related errors.
-        The index attribute contains the index of the character that caused
+def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
-        the error.
+    """Transliterates non-ASCII characters in a string to their ASCII counterparts.
        """
        super(UnidecodeError, self).__init__(message)
        self.index = index
    Args:
        string (str): The input string containing non-ASCII characters.
        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
        replace_str (str): The replacement string used when errors='replace'.
-def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
+    Returns:
-    """Transliterate an Unicode object into an ASCII string
+        str: The transliterated string with non-ASCII characters replaced.
    >>> unidecode("\u5317\u4EB0")
    "Bei Jing "
    This function first tries to convert the string using ASCII codec.
    If it fails (because of non-ASCII characters), it falls back to
    transliteration using the character tables.
    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unidecode_expect_nonascii if
    non-ASCII characters are present.
    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.
    Note that if 'preserve' is used the returned string might not be
    ASCII!
    """
    return _transliterate(string, errors, replace_str)
-    try:
+def _get_ascii_representation(char: str) -> str:
-        bytestring = string.encode('ASCII')
+    """Obtains the ASCII representation of a Unicode character.
    except UnicodeEncodeError:
        pass
    else:
        return string
-    return _unidecode(string, errors, replace_str)
+    Args:
        char (str): The Unicode character.
-def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
+    Returns:
-    """Transliterate an Unicode object into an ASCII string
+        str: The ASCII representation of the character.
    >>> unidecode("\u5317\u4EB0")
    "Bei Jing "
    See unidecode_expect_ascii.
    """
    return _unidecode(string, errors, replace_str)
 unidecode = unidecode_expect_ascii
 def _get_repl_str(char: str) -> Optional[str]:
    codepoint = ord(char)
    # If the character is ASCII, return it as is
    if codepoint < 0x80:
        # Already ASCII
        return str(char)
    # Ignore characters above U+EFFFF (the supplementary Private Use planes)
    if codepoint > 0xeffff:
        # No data on characters in Private Use Area and above.
        return None
    # Warn about surrogate characters
    if 0xd800 <= codepoint <= 0xdfff:
-        warnings.warn(  "Surrogate character %r will be ignored. "
+        logging.warning("Surrogate character %r will be ignored. "
-                        "You might be using a narrow Python build." % (char,),
+                        "You might be using a narrow Python build.", char)
                        RuntimeWarning, 2)
-    section = codepoint >> 8   # Chop off the last two hex digits
+    # Calculate section and position
-    position = codepoint % 256 # Last two hex digits
+    section = codepoint >> 8
    position = codepoint % 256 
    try:
        # Look up the character in the cache
        table = Cache[section]
    except KeyError:
        try:
-            mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
+            # Import the module corresponding to the section
            module_name = f"x{section:03x}.py"
            main = os.path.abspath(os.path.dirname(__file__))
            module_path = os.path.join(main, module_name)
            spec = importlib.util.spec_from_file_location(module_name, module_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except ImportError:
-            # No data on this character
+            # If module import fails, set cache entry to None and return
            Cache[section] = None
            return None
-        Cache[section] = table = mod.data
+        # Update cache with module data
        Cache[section] = table = module.data
    # Return the ASCII representation if found, otherwise None
    if table and len(table) > position:
        return table[position]
    else:
        return None
-def _unidecode(string: str, errors: str, replace_str:str) -> str:
+def _transliterate(string: str, errors: str, replace_str: str) -> str:
    """Main transliteration function.
    Args:
        string (str): The input string.
        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
        replace_str (str): The replacement string used when errors='replace'.
    Returns:
        str: The transliterated string.
    """
    retval = []
-    for index, char in enumerate(string):
+    for char in string:
-        repl = _get_repl_str(char)
+        # Get the ASCII representation of the character
        ascii_char = _get_ascii_representation(char)
-        if repl is None:
+        if ascii_char is None:
            # Handle errors based on the specified policy
            if errors == 'ignore':
-                repl = ''
+                ascii_char = ''
            elif errors == 'strict':
-                raise UnidecodeError('no replacement found for character %r '
+                logging.error(f'No replacement found for character {char!r}')
-                        'in position %d' % (char, index), index)
+                raise UnidecodeError(f'no replacement found for character {char!r}')
            elif errors == 'replace':
-                repl = replace_str
+                ascii_char = replace_str
            elif errors == 'preserve':
-                repl = char
+                ascii_char = char
            else:
-                raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
+                logging.error(f'Invalid value for errors parameter {errors!r}')
                raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
-        retval.append(repl)
+        # Append the ASCII representation to the result
        retval.append(ascii_char)
    return ''.join(retval)
 def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterates non-ASCII characters in a string, expecting ASCII input.
    Args:
        string (str): The input string containing non-ASCII characters.
        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
        replace_str (str): The replacement string used when errors='replace'.
    Returns:
        str: The transliterated string with non-ASCII characters replaced.
    """
    try:
        # Check if the string can be encoded as ASCII
        string.encode('ASCII')
    except UnicodeEncodeError:
        # If encoding fails, fall back to transliteration
        pass
    else:
        # If the string is already ASCII, return it as is
        return string
    # Otherwise, transliterate non-ASCII characters
    return _transliterate(string, errors, replace_str)
 transliterate = transliterate_expect_ascii
--- a/Src/Lib/Unidecode/x000.py
+++ b/Src/Lib/Unidecode/x000.py
@ -42,95 +42,95 @@ data = (
 '',    # 0x9d
 '',    # 0x9e
 '',    # 0x9f
-' ',   # 0xa0
+' ',    # 0xa0
-'!',   # 0xa1
+'!',    # 0xa1
-'C/',  # 0xa2
+'C/',    # 0xa2
 # Not "GBP" - Pound Sign is used for more than just British Pounds.
 'PS',    # 0xa3
 '$?',    # 0xa4
 'Y=',    # 0xa5
-'|',     # 0xa6
+'|',    # 0xa6
 'SS',    # 0xa7
-'"',     # 0xa8
+'"',    # 0xa8
-'(c)',   # 0xa9
+'(c)',    # 0xa9
-'a',     # 0xaa
+'a',    # 0xaa
 '<<',    # 0xab
-'!',     # 0xac
+'!',    # 0xac
-'',      # 0xad
+'',    # 0xad
-'(r)',   # 0xae
+'(r)',    # 0xae
-'-',     # 0xaf
+'-',    # 0xaf
-'deg',   # 0xb0
+'deg',    # 0xb0
 '+-',    # 0xb1
 # These might be combined with other superscript digits (u+2070 - u+2079)
-'2',     # 0xb2
+'2',    # 0xb2
-'3',     # 0xb3
+'3',    # 0xb3
 '\'',    # 0xb4
-'u',     # 0xb5
+'u',    # 0xb5
-'P',     # 0xb6
+'P',    # 0xb6
-'*',     # 0xb7
+'*',    # 0xb7
-',',     # 0xb8
+',',    # 0xb8
-'1',     # 0xb9
+'1',    # 0xb9
-'o',     # 0xba
+'o',    # 0xba
 '>>',    # 0xbb
-' 1/4',  # 0xbc
+' 1/4',    # 0xbc
-' 1/2',  # 0xbd
+' 1/2',    # 0xbd
-' 3/4',  # 0xbe
+' 3/4',    # 0xbe
-'?',     # 0xbf
+'?',    # 0xbf
-'A',     # 0xc0
+'A',    # 0xc0
-'A',     # 0xc1
+'A',    # 0xc1
-'A',     # 0xc2
+'A',    # 0xc2
-'A',     # 0xc3
+'A',    # 0xc3
 # Not "AE" - used in languages other than German
 'A',    # 0xc4
-'A',     # 0xc5
+'A',    # 0xc5
 'AE',    # 0xc6
-'C',     # 0xc7
+'C',    # 0xc7
-'E',     # 0xc8
+'E',    # 0xc8
-'E',     # 0xc9
+'E',    # 0xc9
-'E',     # 0xca
+'E',    # 0xca
-'E',     # 0xcb
+'E',    # 0xcb
-'I',     # 0xcc
+'I',    # 0xcc
-'I',     # 0xcd
+'I',    # 0xcd
-'I',     # 0xce
+'I',    # 0xce
-'I',     # 0xcf
+'I',    # 0xcf
-'D',     # 0xd0
+'D',    # 0xd0
-'N',     # 0xd1
+'N',    # 0xd1
-'O',     # 0xd2
+'O',    # 0xd2
-'O',     # 0xd3
+'O',    # 0xd3
-'O',     # 0xd4
+'O',    # 0xd4
-'O',     # 0xd5
+'O',    # 0xd5
 # Not "OE" - used in languages other than German
-'O',     # 0xd6
+'O',    # 0xd6
-'x',     # 0xd7
+'x',    # 0xd7
-'O',     # 0xd8
+'O',    # 0xd8
-'U',     # 0xd9
+'U',    # 0xd9
-'U',     # 0xda
+'U',    # 0xda
-'U',     # 0xdb
+'U',    # 0xdb
 # Not "UE" - used in languages other than German
-'U',     # 0xdc
+'U',    # 0xdc
-'Y',     # 0xdd
+'Y',    # 0xdd
 'Th',    # 0xde
 'ss',    # 0xdf
-'a',     # 0xe0
+'a',    # 0xe0
-'a',     # 0xe1
+'a',    # 0xe1
-'a',     # 0xe2
+'a',    # 0xe2
-'a',     # 0xe3
+'a',    # 0xe3
 # Not "ae" - used in languages other than German
 'a',    # 0xe4
 'a',    # 0xe5
-'ae',   # 0xe6
+'ae',    # 0xe6
 'c',    # 0xe7
 'e',    # 0xe8
 'e',    # 0xe9
@ -160,6 +160,6 @@ data = (
 'u',    # 0xfc
 'y',    # 0xfd
-'th',   # 0xfe
+'th',    # 0xfe
 'y',    # 0xff
 )