mirror of
https://github.com/Arrowar/StreamingCommunity.git
synced 2025-06-07 20:15:24 +00:00
Fix unidecode
This commit is contained in:
parent
9d1b7536b7
commit
3a99ca2afe
@ -5,7 +5,7 @@ from Src.Util.console import console, msg
|
|||||||
from Src.Util.config import config_manager
|
from Src.Util.config import config_manager
|
||||||
from Src.Util.table import TVShowManager
|
from Src.Util.table import TVShowManager
|
||||||
from Src.Util.message import start_message
|
from Src.Util.message import start_message
|
||||||
from Src.Lib.Unidecode import unidecode
|
from Src.Lib.Unidecode import transliterate
|
||||||
from Src.Lib.FFmpeg.my_m3u8 import Downloader
|
from Src.Lib.FFmpeg.my_m3u8 import Downloader
|
||||||
from .Class import VideoSource
|
from .Class import VideoSource
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
|
|||||||
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
|
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
|
||||||
|
|
||||||
# Define filename and path for the downloaded video
|
# Define filename and path for the downloaded video
|
||||||
mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
||||||
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
|
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
|
||||||
os.makedirs(mp4_path, exist_ok=True)
|
os.makedirs(mp4_path, exist_ok=True)
|
||||||
|
|
||||||
|
@ -1,138 +1,143 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# 04.04.24
|
||||||
# vi:tabstop=4:expandtab:sw=4
|
|
||||||
"""Transliterate Unicode text into plain 7-bit ASCII.
|
|
||||||
|
|
||||||
Example usage:
|
|
||||||
|
|
||||||
>>> from unidecode import unidecode
|
# Import
|
||||||
>>> unidecode("\u5317\u4EB0")
|
import os
|
||||||
"Bei Jing "
|
import logging
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
The transliteration uses a straightforward map, and doesn't have alternatives
|
# Variable
|
||||||
for the same character based on language, position, or anything else.
|
Cache = {}
|
||||||
|
|
||||||
A standard string object will be returned. If you need bytes, use:
|
|
||||||
|
|
||||||
>>> unidecode("Κνωσός").encode("ascii")
|
|
||||||
b'Knosos'
|
|
||||||
"""
|
|
||||||
import warnings
|
|
||||||
from typing import Dict, Optional, Sequence
|
|
||||||
|
|
||||||
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
|
|
||||||
|
|
||||||
class UnidecodeError(ValueError):
|
class UnidecodeError(ValueError):
|
||||||
def __init__(self, message: str, index: Optional[int] = None) -> None:
|
pass
|
||||||
"""Raised for Unidecode-related errors.
|
|
||||||
|
|
||||||
The index attribute contains the index of the character that caused
|
def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||||
the error.
|
"""Transliterates non-ASCII characters in a string to their ASCII counterparts.
|
||||||
"""
|
|
||||||
super(UnidecodeError, self).__init__(message)
|
|
||||||
self.index = index
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (str): The input string containing non-ASCII characters.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
Returns:
|
||||||
"""Transliterate an Unicode object into an ASCII string
|
str: The transliterated string with non-ASCII characters replaced.
|
||||||
|
|
||||||
>>> unidecode("\u5317\u4EB0")
|
|
||||||
"Bei Jing "
|
|
||||||
|
|
||||||
This function first tries to convert the string using ASCII codec.
|
|
||||||
If it fails (because of non-ASCII characters), it falls back to
|
|
||||||
transliteration using the character tables.
|
|
||||||
|
|
||||||
This is approx. five times faster if the string only contains ASCII
|
|
||||||
characters, but slightly slower than unicode_expect_nonascii if
|
|
||||||
non-ASCII characters are present.
|
|
||||||
|
|
||||||
errors specifies what to do with characters that have not been
|
|
||||||
found in replacement tables. The default is 'ignore' which ignores
|
|
||||||
the character. 'strict' raises an UnidecodeError. 'replace'
|
|
||||||
substitutes the character with replace_str (default is '?').
|
|
||||||
'preserve' keeps the original character.
|
|
||||||
|
|
||||||
Note that if 'preserve' is used the returned string might not be
|
|
||||||
ASCII!
|
|
||||||
"""
|
"""
|
||||||
|
return _transliterate(string, errors, replace_str)
|
||||||
|
|
||||||
try:
|
def _get_ascii_representation(char: str) -> str:
|
||||||
bytestring = string.encode('ASCII')
|
"""Obtains the ASCII representation of a Unicode character.
|
||||||
except UnicodeEncodeError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return string
|
|
||||||
|
|
||||||
return _unidecode(string, errors, replace_str)
|
Args:
|
||||||
|
char (str): The Unicode character.
|
||||||
|
|
||||||
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
Returns:
|
||||||
"""Transliterate an Unicode object into an ASCII string
|
str: The ASCII representation of the character.
|
||||||
|
|
||||||
>>> unidecode("\u5317\u4EB0")
|
|
||||||
"Bei Jing "
|
|
||||||
|
|
||||||
See unidecode_expect_ascii.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return _unidecode(string, errors, replace_str)
|
|
||||||
|
|
||||||
unidecode = unidecode_expect_ascii
|
|
||||||
|
|
||||||
def _get_repl_str(char: str) -> Optional[str]:
|
|
||||||
codepoint = ord(char)
|
codepoint = ord(char)
|
||||||
|
|
||||||
|
# If the character is ASCII, return it as is
|
||||||
if codepoint < 0x80:
|
if codepoint < 0x80:
|
||||||
# Already ASCII
|
|
||||||
return str(char)
|
return str(char)
|
||||||
|
|
||||||
|
# Ignore characters outside the BMP (Basic Multilingual Plane)
|
||||||
if codepoint > 0xeffff:
|
if codepoint > 0xeffff:
|
||||||
# No data on characters in Private Use Area and above.
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Warn about surrogate characters
|
||||||
if 0xd800 <= codepoint <= 0xdfff:
|
if 0xd800 <= codepoint <= 0xdfff:
|
||||||
warnings.warn( "Surrogate character %r will be ignored. "
|
logging.warning("Surrogate character %r will be ignored. "
|
||||||
"You might be using a narrow Python build." % (char,),
|
"You might be using a narrow Python build.", char)
|
||||||
RuntimeWarning, 2)
|
|
||||||
|
|
||||||
section = codepoint >> 8 # Chop off the last two hex digits
|
# Calculate section and position
|
||||||
position = codepoint % 256 # Last two hex digits
|
section = codepoint >> 8
|
||||||
|
position = codepoint % 256
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Look up the character in the cache
|
||||||
table = Cache[section]
|
table = Cache[section]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
try:
|
try:
|
||||||
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
|
# Import the module corresponding to the section
|
||||||
|
module_name = f"x{section:03x}.py"
|
||||||
|
main = os.path.abspath(os.path.dirname(__file__))
|
||||||
|
module_path = os.path.join(main, module_name)
|
||||||
|
spec = importlib.util.spec_from_file_location(module_name, module_path)
|
||||||
|
module = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(module)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# No data on this character
|
# If module import fails, set cache entry to None and return
|
||||||
Cache[section] = None
|
Cache[section] = None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
Cache[section] = table = mod.data
|
# Update cache with module data
|
||||||
|
Cache[section] = table = module.data
|
||||||
|
|
||||||
|
# Return the ASCII representation if found, otherwise None
|
||||||
if table and len(table) > position:
|
if table and len(table) > position:
|
||||||
return table[position]
|
return table[position]
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _unidecode(string: str, errors: str, replace_str:str) -> str:
|
def _transliterate(string: str, errors: str, replace_str: str) -> str:
|
||||||
|
"""Main transliteration function.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (str): The input string.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The transliterated string.
|
||||||
|
"""
|
||||||
retval = []
|
retval = []
|
||||||
|
|
||||||
for index, char in enumerate(string):
|
for char in string:
|
||||||
repl = _get_repl_str(char)
|
# Get the ASCII representation of the character
|
||||||
|
ascii_char = _get_ascii_representation(char)
|
||||||
|
|
||||||
if repl is None:
|
if ascii_char is None:
|
||||||
|
# Handle errors based on the specified policy
|
||||||
if errors == 'ignore':
|
if errors == 'ignore':
|
||||||
repl = ''
|
ascii_char = ''
|
||||||
elif errors == 'strict':
|
elif errors == 'strict':
|
||||||
raise UnidecodeError('no replacement found for character %r '
|
logging.error(f'No replacement found for character {char!r}')
|
||||||
'in position %d' % (char, index), index)
|
raise UnidecodeError(f'no replacement found for character {char!r}')
|
||||||
elif errors == 'replace':
|
elif errors == 'replace':
|
||||||
repl = replace_str
|
ascii_char = replace_str
|
||||||
elif errors == 'preserve':
|
elif errors == 'preserve':
|
||||||
repl = char
|
ascii_char = char
|
||||||
else:
|
else:
|
||||||
raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
|
logging.error(f'Invalid value for errors parameter {errors!r}')
|
||||||
|
raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
|
||||||
|
|
||||||
retval.append(repl)
|
# Append the ASCII representation to the result
|
||||||
|
retval.append(ascii_char)
|
||||||
|
|
||||||
return ''.join(retval)
|
return ''.join(retval)
|
||||||
|
|
||||||
|
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||||
|
"""Transliterates non-ASCII characters in a string, expecting ASCII input.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (str): The input string containing non-ASCII characters.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The transliterated string with non-ASCII characters replaced.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Check if the string can be encoded as ASCII
|
||||||
|
string.encode('ASCII')
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
# If encoding fails, fall back to transliteration
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# If the string is already ASCII, return it as is
|
||||||
|
return string
|
||||||
|
|
||||||
|
# Otherwise, transliterate non-ASCII characters
|
||||||
|
return _transliterate(string, errors, replace_str)
|
||||||
|
|
||||||
|
transliterate = transliterate_expect_ascii
|
||||||
|
@ -42,95 +42,95 @@ data = (
|
|||||||
'', # 0x9d
|
'', # 0x9d
|
||||||
'', # 0x9e
|
'', # 0x9e
|
||||||
'', # 0x9f
|
'', # 0x9f
|
||||||
' ', # 0xa0
|
' ', # 0xa0
|
||||||
'!', # 0xa1
|
'!', # 0xa1
|
||||||
'C/', # 0xa2
|
'C/', # 0xa2
|
||||||
|
|
||||||
# Not "GBP" - Pound Sign is used for more than just British Pounds.
|
# Not "GBP" - Pound Sign is used for more than just British Pounds.
|
||||||
'PS', # 0xa3
|
'PS', # 0xa3
|
||||||
|
|
||||||
'$?', # 0xa4
|
'$?', # 0xa4
|
||||||
'Y=', # 0xa5
|
'Y=', # 0xa5
|
||||||
'|', # 0xa6
|
'|', # 0xa6
|
||||||
'SS', # 0xa7
|
'SS', # 0xa7
|
||||||
'"', # 0xa8
|
'"', # 0xa8
|
||||||
'(c)', # 0xa9
|
'(c)', # 0xa9
|
||||||
'a', # 0xaa
|
'a', # 0xaa
|
||||||
'<<', # 0xab
|
'<<', # 0xab
|
||||||
'!', # 0xac
|
'!', # 0xac
|
||||||
'', # 0xad
|
'', # 0xad
|
||||||
'(r)', # 0xae
|
'(r)', # 0xae
|
||||||
'-', # 0xaf
|
'-', # 0xaf
|
||||||
'deg', # 0xb0
|
'deg', # 0xb0
|
||||||
'+-', # 0xb1
|
'+-', # 0xb1
|
||||||
|
|
||||||
# These might be combined with other superscript digits (u+2070 - u+2079)
|
# These might be combined with other superscript digits (u+2070 - u+2079)
|
||||||
'2', # 0xb2
|
'2', # 0xb2
|
||||||
'3', # 0xb3
|
'3', # 0xb3
|
||||||
|
|
||||||
'\'', # 0xb4
|
'\'', # 0xb4
|
||||||
'u', # 0xb5
|
'u', # 0xb5
|
||||||
'P', # 0xb6
|
'P', # 0xb6
|
||||||
'*', # 0xb7
|
'*', # 0xb7
|
||||||
',', # 0xb8
|
',', # 0xb8
|
||||||
'1', # 0xb9
|
'1', # 0xb9
|
||||||
'o', # 0xba
|
'o', # 0xba
|
||||||
'>>', # 0xbb
|
'>>', # 0xbb
|
||||||
' 1/4', # 0xbc
|
' 1/4', # 0xbc
|
||||||
' 1/2', # 0xbd
|
' 1/2', # 0xbd
|
||||||
' 3/4', # 0xbe
|
' 3/4', # 0xbe
|
||||||
'?', # 0xbf
|
'?', # 0xbf
|
||||||
'A', # 0xc0
|
'A', # 0xc0
|
||||||
'A', # 0xc1
|
'A', # 0xc1
|
||||||
'A', # 0xc2
|
'A', # 0xc2
|
||||||
'A', # 0xc3
|
'A', # 0xc3
|
||||||
|
|
||||||
# Not "AE" - used in languages other than German
|
# Not "AE" - used in languages other than German
|
||||||
'A', # 0xc4
|
'A', # 0xc4
|
||||||
|
|
||||||
'A', # 0xc5
|
'A', # 0xc5
|
||||||
'AE', # 0xc6
|
'AE', # 0xc6
|
||||||
'C', # 0xc7
|
'C', # 0xc7
|
||||||
'E', # 0xc8
|
'E', # 0xc8
|
||||||
'E', # 0xc9
|
'E', # 0xc9
|
||||||
'E', # 0xca
|
'E', # 0xca
|
||||||
'E', # 0xcb
|
'E', # 0xcb
|
||||||
'I', # 0xcc
|
'I', # 0xcc
|
||||||
'I', # 0xcd
|
'I', # 0xcd
|
||||||
'I', # 0xce
|
'I', # 0xce
|
||||||
'I', # 0xcf
|
'I', # 0xcf
|
||||||
'D', # 0xd0
|
'D', # 0xd0
|
||||||
'N', # 0xd1
|
'N', # 0xd1
|
||||||
'O', # 0xd2
|
'O', # 0xd2
|
||||||
'O', # 0xd3
|
'O', # 0xd3
|
||||||
'O', # 0xd4
|
'O', # 0xd4
|
||||||
'O', # 0xd5
|
'O', # 0xd5
|
||||||
|
|
||||||
# Not "OE" - used in languages other than German
|
# Not "OE" - used in languages other than German
|
||||||
'O', # 0xd6
|
'O', # 0xd6
|
||||||
|
|
||||||
'x', # 0xd7
|
'x', # 0xd7
|
||||||
'O', # 0xd8
|
'O', # 0xd8
|
||||||
'U', # 0xd9
|
'U', # 0xd9
|
||||||
'U', # 0xda
|
'U', # 0xda
|
||||||
'U', # 0xdb
|
'U', # 0xdb
|
||||||
|
|
||||||
# Not "UE" - used in languages other than German
|
# Not "UE" - used in languages other than German
|
||||||
'U', # 0xdc
|
'U', # 0xdc
|
||||||
|
|
||||||
'Y', # 0xdd
|
'Y', # 0xdd
|
||||||
'Th', # 0xde
|
'Th', # 0xde
|
||||||
'ss', # 0xdf
|
'ss', # 0xdf
|
||||||
'a', # 0xe0
|
'a', # 0xe0
|
||||||
'a', # 0xe1
|
'a', # 0xe1
|
||||||
'a', # 0xe2
|
'a', # 0xe2
|
||||||
'a', # 0xe3
|
'a', # 0xe3
|
||||||
|
|
||||||
# Not "ae" - used in languages other than German
|
# Not "ae" - used in languages other than German
|
||||||
'a', # 0xe4
|
'a', # 0xe4
|
||||||
|
|
||||||
'a', # 0xe5
|
'a', # 0xe5
|
||||||
'ae', # 0xe6
|
'ae', # 0xe6
|
||||||
'c', # 0xe7
|
'c', # 0xe7
|
||||||
'e', # 0xe8
|
'e', # 0xe8
|
||||||
'e', # 0xe9
|
'e', # 0xe9
|
||||||
@ -160,6 +160,6 @@ data = (
|
|||||||
'u', # 0xfc
|
'u', # 0xfc
|
||||||
|
|
||||||
'y', # 0xfd
|
'y', # 0xfd
|
||||||
'th', # 0xfe
|
'th', # 0xfe
|
||||||
'y', # 0xff
|
'y', # 0xff
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user