Fix unidecode

2025-07-23 18:40:05 +00:00 · 2024-04-04 15:21:40 +02:00 · 2024-04-04 15:21:40 +02:00 · 3a99ca2afe
commit 3a99ca2afe
parent 9d1b7536b7
3 changed files with 154 additions and 149 deletions
--- a/Src/Api/series.py
+++ b/Src/Api/series.py
@@ -5,7 +5,7 @@ from Src.Util.console import console, msg
 from Src.Util.config import config_manager
 from Src.Util.table import TVShowManager
 from Src.Util.message import start_message
-from Src.Lib.Unidecode import unidecode
+from Src.Lib.Unidecode import transliterate
 from Src.Lib.FFmpeg.my_m3u8 import Downloader
 from .Class import VideoSource

@@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
    episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id

    # Define filename and path for the downloaded video
-    mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
+    mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
    mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
    os.makedirs(mp4_path, exist_ok=True)

--- a/Src/Lib/Unidecode/__init__.py
+++ b/Src/Lib/Unidecode/__init__.py
@@ -1,138 +1,143 @@
-# -*- coding: utf-8 -*-
-# vi:tabstop=4:expandtab:sw=4
-"""Transliterate Unicode text into plain 7-bit ASCII.
+# 04.04.24

-Example usage:

->>> from unidecode import unidecode
->>> unidecode("\u5317\u4EB0")
-"Bei Jing "
+# Import
+import os
+import logging
+import importlib.util

-The transliteration uses a straightforward map, and doesn't have alternatives
-for the same character based on language, position, or anything else.
-
-A standard string object will be returned. If you need bytes, use:
-
->>> unidecode("Κνωσός").encode("ascii")
-b'Knosos'
-"""
-import warnings
-from typing import Dict, Optional, Sequence
-
-Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
+# Cache of replacement tables keyed by section (codepoint >> 8); None marks a section with no table
+Cache = {}

 class UnidecodeError(ValueError):
-    def __init__(self, message: str, index: Optional[int] = None) -> None:
-        """Raised for Unidecode-related errors.
+    pass

-        The index attribute contains the index of the character that caused
-        the error.
-        """
-        super(UnidecodeError, self).__init__(message)
-        self.index = index
+def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
+    """Transliterates non-ASCII characters in a string to their ASCII counterparts.

+    Args:
+        string (str): The input string containing non-ASCII characters.
+        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
+        replace_str (str): The replacement string used when errors='replace'.

-def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
-    """Transliterate an Unicode object into an ASCII string
-
-    >>> unidecode("\u5317\u4EB0")
-    "Bei Jing "
-
-    This function first tries to convert the string using ASCII codec.
-    If it fails (because of non-ASCII characters), it falls back to
-    transliteration using the character tables.
-
-    This is approx. five times faster if the string only contains ASCII
-    characters, but slightly slower than unicode_expect_nonascii if
-    non-ASCII characters are present.
-
-    errors specifies what to do with characters that have not been
-    found in replacement tables. The default is 'ignore' which ignores
-    the character. 'strict' raises an UnidecodeError. 'replace'
-    substitutes the character with replace_str (default is '?').
-    'preserve' keeps the original character.
-
-    Note that if 'preserve' is used the returned string might not be
-    ASCII!
+    Returns:
+        str: The transliterated string with non-ASCII characters replaced.
    """
+    return _transliterate(string, errors, replace_str)

-    try:
-        bytestring = string.encode('ASCII')
-    except UnicodeEncodeError:
-        pass
-    else:
-        return string
+def _get_ascii_representation(char: str) -> str:
+    """Obtains the ASCII representation of a Unicode character.

-    return _unidecode(string, errors, replace_str)
+    Args:
+        char (str): The Unicode character.

-def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
-    """Transliterate an Unicode object into an ASCII string
-
-    >>> unidecode("\u5317\u4EB0")
-    "Bei Jing "
-
-    See unidecode_expect_ascii.
+    Returns:
+        str: The ASCII representation of the character.
    """
-
-    return _unidecode(string, errors, replace_str)
-
-unidecode = unidecode_expect_ascii
-
-def _get_repl_str(char: str) -> Optional[str]:
    codepoint = ord(char)

+    # If the character is ASCII, return it as is
    if codepoint < 0x80:
-        # Already ASCII
        return str(char)

+    # No table data for code points above U+EFFFF (Private Use Area planes and beyond)
    if codepoint > 0xeffff:
-        # No data on characters in Private Use Area and above.
        return None

+    # Warn about surrogate characters
    if 0xd800 <= codepoint <= 0xdfff:
-        warnings.warn(  "Surrogate character %r will be ignored. "
-                        "You might be using a narrow Python build." % (char,),
-                        RuntimeWarning, 2)
+        logging.warning("Surrogate character %r will be ignored. "
+                        "You might be using a narrow Python build.", char)

-    section = codepoint >> 8   # Chop off the last two hex digits
-    position = codepoint % 256 # Last two hex digits
+    # Calculate section and position
+    section = codepoint >> 8
+    position = codepoint % 256 

    try:
+        # Look up this section's replacement table in the cache
        table = Cache[section]
    except KeyError:
        try:
-            mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
+            # Import the module corresponding to the section
+            module_name = f"x{section:03x}.py"
+            main = os.path.abspath(os.path.dirname(__file__))
+            module_path = os.path.join(main, module_name)
+            spec = importlib.util.spec_from_file_location(module_name, module_path)
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
        except ImportError:
-            # No data on this character
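+            # No table for this section: cache None and return. NOTE(review): a missing file makes exec_module raise FileNotFoundError, which ImportError does not catch - confirm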
            Cache[section] = None
            return None

-        Cache[section] = table = mod.data
+        # Update cache with module data
+        Cache[section] = table = module.data

+    # Return the ASCII representation if found, otherwise None
    if table and len(table) > position:
        return table[position]
    else:
        return None

-def _unidecode(string: str, errors: str, replace_str:str) -> str:
+def _transliterate(string: str, errors: str, replace_str: str) -> str:
+    """Main transliteration function.
+
+    Args:
+        string (str): The input string.
+        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
+        replace_str (str): The replacement string used when errors='replace'.
+
+    Returns:
+        str: The transliterated string.
+    """
    retval = []

-    for index, char in enumerate(string):
-        repl = _get_repl_str(char)
+    for char in string:
+        # Get the ASCII representation of the character
+        ascii_char = _get_ascii_representation(char)

-        if repl is None:
+        if ascii_char is None:
+            # Handle errors based on the specified policy
            if errors == 'ignore':
-                repl = ''
+                ascii_char = ''
            elif errors == 'strict':
-                raise UnidecodeError('no replacement found for character %r '
-                        'in position %d' % (char, index), index)
+                logging.error(f'No replacement found for character {char!r}')
+                raise UnidecodeError(f'no replacement found for character {char!r}')
            elif errors == 'replace':
-                repl = replace_str
+                ascii_char = replace_str
            elif errors == 'preserve':
-                repl = char
+                ascii_char = char
            else:
-                raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
+                logging.error(f'Invalid value for errors parameter {errors!r}')
+                raise UnidecodeError(f'invalid value for errors parameter {errors!r}')

-        retval.append(repl)
+        # Append the ASCII representation to the result
+        retval.append(ascii_char)

    return ''.join(retval)
+
+def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
+    """Transliterates non-ASCII characters in a string, expecting ASCII input.
+
+    Args:
+        string (str): The input string containing non-ASCII characters.
+        errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
+        replace_str (str): The replacement string used when errors='replace'.
+
+    Returns:
+        str: The transliterated string with non-ASCII characters replaced.
+    """
+    try:
+        # Check if the string can be encoded as ASCII
+        string.encode('ASCII')
+    except UnicodeEncodeError:
+        # If encoding fails, fall back to transliteration
+        pass
+    else:
+        # If the string is already ASCII, return it as is
+        return string
+
+    # Otherwise, transliterate non-ASCII characters
+    return _transliterate(string, errors, replace_str)
+
+transliterate = transliterate_expect_ascii
--- a/Src/Lib/Unidecode/x000.py
+++ b/Src/Lib/Unidecode/x000.py
@@ -42,95 +42,95 @@ data = (
 '',    # 0x9d
 '',    # 0x9e
 '',    # 0x9f
-' ',   # 0xa0
-'!',   # 0xa1
-'C/',  # 0xa2
+' ',    # 0xa0
+'!',    # 0xa1
+'C/',    # 0xa2

 # Not "GBP" - Pound Sign is used for more than just British Pounds.
 'PS',    # 0xa3

 '$?',    # 0xa4
 'Y=',    # 0xa5
-'|',     # 0xa6
+'|',    # 0xa6
 'SS',    # 0xa7
-'"',     # 0xa8
-'(c)',   # 0xa9
-'a',     # 0xaa
+'"',    # 0xa8
+'(c)',    # 0xa9
+'a',    # 0xaa
 '<<',    # 0xab
-'!',     # 0xac
-'',      # 0xad
-'(r)',   # 0xae
-'-',     # 0xaf
-'deg',   # 0xb0
+'!',    # 0xac
+'',    # 0xad
+'(r)',    # 0xae
+'-',    # 0xaf
+'deg',    # 0xb0
 '+-',    # 0xb1

 # These might be combined with other superscript digits (u+2070 - u+2079)
-'2',     # 0xb2
-'3',     # 0xb3
+'2',    # 0xb2
+'3',    # 0xb3

 '\'',    # 0xb4
-'u',     # 0xb5
-'P',     # 0xb6
-'*',     # 0xb7
-',',     # 0xb8
-'1',     # 0xb9
-'o',     # 0xba
+'u',    # 0xb5
+'P',    # 0xb6
+'*',    # 0xb7
+',',    # 0xb8
+'1',    # 0xb9
+'o',    # 0xba
 '>>',    # 0xbb
-' 1/4',  # 0xbc
-' 1/2',  # 0xbd
-' 3/4',  # 0xbe
-'?',     # 0xbf
-'A',     # 0xc0
-'A',     # 0xc1
-'A',     # 0xc2
-'A',     # 0xc3
+' 1/4',    # 0xbc
+' 1/2',    # 0xbd
+' 3/4',    # 0xbe
+'?',    # 0xbf
+'A',    # 0xc0
+'A',    # 0xc1
+'A',    # 0xc2
+'A',    # 0xc3

 # Not "AE" - used in languages other than German
 'A',    # 0xc4

-'A',     # 0xc5
+'A',    # 0xc5
 'AE',    # 0xc6
-'C',     # 0xc7
-'E',     # 0xc8
-'E',     # 0xc9
-'E',     # 0xca
-'E',     # 0xcb
-'I',     # 0xcc
-'I',     # 0xcd
-'I',     # 0xce
-'I',     # 0xcf
-'D',     # 0xd0
-'N',     # 0xd1
-'O',     # 0xd2
-'O',     # 0xd3
-'O',     # 0xd4
-'O',     # 0xd5
+'C',    # 0xc7
+'E',    # 0xc8
+'E',    # 0xc9
+'E',    # 0xca
+'E',    # 0xcb
+'I',    # 0xcc
+'I',    # 0xcd
+'I',    # 0xce
+'I',    # 0xcf
+'D',    # 0xd0
+'N',    # 0xd1
+'O',    # 0xd2
+'O',    # 0xd3
+'O',    # 0xd4
+'O',    # 0xd5

 # Not "OE" - used in languages other than German
-'O',     # 0xd6
+'O',    # 0xd6

-'x',     # 0xd7
-'O',     # 0xd8
-'U',     # 0xd9
-'U',     # 0xda
-'U',     # 0xdb
+'x',    # 0xd7
+'O',    # 0xd8
+'U',    # 0xd9
+'U',    # 0xda
+'U',    # 0xdb

 # Not "UE" - used in languages other than German
-'U',     # 0xdc
+'U',    # 0xdc

-'Y',     # 0xdd
+'Y',    # 0xdd
 'Th',    # 0xde
 'ss',    # 0xdf
-'a',     # 0xe0
-'a',     # 0xe1
-'a',     # 0xe2
-'a',     # 0xe3
+'a',    # 0xe0
+'a',    # 0xe1
+'a',    # 0xe2
+'a',    # 0xe3

 # Not "ae" - used in languages other than German
 'a',    # 0xe4

 'a',    # 0xe5
-'ae',   # 0xe6
+'ae',    # 0xe6
 'c',    # 0xe7
 'e',    # 0xe8
 'e',    # 0xe9
@@ -160,6 +160,6 @@ data = (
 'u',    # 0xfc

 'y',    # 0xfd
-'th',   # 0xfe
+'th',    # 0xfe
 'y',    # 0xff
 )