mirror of
https://github.com/Arrowar/StreamingCommunity.git
synced 2025-06-06 19:45:24 +00:00
Fix unidecode
This commit is contained in:
parent
9d1b7536b7
commit
3a99ca2afe
@ -5,7 +5,7 @@ from Src.Util.console import console, msg
|
||||
from Src.Util.config import config_manager
|
||||
from Src.Util.table import TVShowManager
|
||||
from Src.Util.message import start_message
|
||||
from Src.Lib.Unidecode import unidecode
|
||||
from Src.Lib.Unidecode import transliterate
|
||||
from Src.Lib.FFmpeg.my_m3u8 import Downloader
|
||||
from .Class import VideoSource
|
||||
|
||||
@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
|
||||
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
|
||||
|
||||
# Define filename and path for the downloaded video
|
||||
mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
||||
mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
||||
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
|
||||
os.makedirs(mp4_path, exist_ok=True)
|
||||
|
||||
|
@ -1,138 +1,143 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:tabstop=4:expandtab:sw=4
|
||||
"""Transliterate Unicode text into plain 7-bit ASCII.
|
||||
# 04.04.24
|
||||
|
||||
Example usage:
|
||||
|
||||
>>> from unidecode import unidecode
|
||||
>>> unidecode("\u5317\u4EB0")
|
||||
"Bei Jing "
|
||||
# Import
|
||||
import os
|
||||
import logging
|
||||
import importlib.util
|
||||
|
||||
The transliteration uses a straightforward map, and doesn't have alternatives
|
||||
for the same character based on language, position, or anything else.
|
||||
|
||||
A standard string object will be returned. If you need bytes, use:
|
||||
|
||||
>>> unidecode("Κνωσός").encode("ascii")
|
||||
b'Knosos'
|
||||
"""
|
||||
import warnings
|
||||
from typing import Dict, Optional, Sequence
|
||||
|
||||
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
|
||||
# Variable
|
||||
Cache = {}
|
||||
|
||||
class UnidecodeError(ValueError):
|
||||
def __init__(self, message: str, index: Optional[int] = None) -> None:
|
||||
"""Raised for Unidecode-related errors.
|
||||
pass
|
||||
|
||||
The index attribute contains the index of the character that caused
|
||||
the error.
|
||||
"""
|
||||
super(UnidecodeError, self).__init__(message)
|
||||
self.index = index
|
||||
def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||
"""Transliterates non-ASCII characters in a string to their ASCII counterparts.
|
||||
|
||||
Args:
|
||||
string (str): The input string containing non-ASCII characters.
|
||||
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||
replace_str (str): The replacement string used when errors='replace'.
|
||||
|
||||
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||
"""Transliterate an Unicode object into an ASCII string
|
||||
|
||||
>>> unidecode("\u5317\u4EB0")
|
||||
"Bei Jing "
|
||||
|
||||
This function first tries to convert the string using ASCII codec.
|
||||
If it fails (because of non-ASCII characters), it falls back to
|
||||
transliteration using the character tables.
|
||||
|
||||
This is approx. five times faster if the string only contains ASCII
|
||||
characters, but slightly slower than unidecode_expect_nonascii if
|
||||
non-ASCII characters are present.
|
||||
|
||||
errors specifies what to do with characters that have not been
|
||||
found in replacement tables. The default is 'ignore' which ignores
|
||||
the character. 'strict' raises an UnidecodeError. 'replace'
|
||||
substitutes the character with replace_str (default is '?').
|
||||
'preserve' keeps the original character.
|
||||
|
||||
Note that if 'preserve' is used the returned string might not be
|
||||
ASCII!
|
||||
Returns:
|
||||
str: The transliterated string with non-ASCII characters replaced.
|
||||
"""
|
||||
return _transliterate(string, errors, replace_str)
|
||||
|
||||
try:
|
||||
bytestring = string.encode('ASCII')
|
||||
except UnicodeEncodeError:
|
||||
pass
|
||||
else:
|
||||
return string
|
||||
def _get_ascii_representation(char: str) -> str:
|
||||
"""Obtains the ASCII representation of a Unicode character.
|
||||
|
||||
return _unidecode(string, errors, replace_str)
|
||||
Args:
|
||||
char (str): The Unicode character.
|
||||
|
||||
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||
"""Transliterate an Unicode object into an ASCII string
|
||||
|
||||
>>> unidecode("\u5317\u4EB0")
|
||||
"Bei Jing "
|
||||
|
||||
See unidecode_expect_ascii.
|
||||
Returns:
|
||||
str: The ASCII representation of the character.
|
||||
"""
|
||||
|
||||
return _unidecode(string, errors, replace_str)
|
||||
|
||||
unidecode = unidecode_expect_ascii
|
||||
|
||||
def _get_repl_str(char: str) -> Optional[str]:
|
||||
codepoint = ord(char)
|
||||
|
||||
# If the character is ASCII, return it as is
|
||||
if codepoint < 0x80:
|
||||
# Already ASCII
|
||||
return str(char)
|
||||
|
||||
# Ignore code points above U+EFFFF (the supplementary Private Use planes); no table data exists for them
|
||||
if codepoint > 0xeffff:
|
||||
# No data on characters in Private Use Area and above.
|
||||
return None
|
||||
|
||||
# Warn about surrogate characters
|
||||
if 0xd800 <= codepoint <= 0xdfff:
|
||||
warnings.warn( "Surrogate character %r will be ignored. "
|
||||
"You might be using a narrow Python build." % (char,),
|
||||
RuntimeWarning, 2)
|
||||
logging.warning("Surrogate character %r will be ignored. "
|
||||
"You might be using a narrow Python build.", char)
|
||||
|
||||
section = codepoint >> 8 # Chop off the last two hex digits
|
||||
position = codepoint % 256 # Last two hex digits
|
||||
# Calculate section and position
|
||||
section = codepoint >> 8
|
||||
position = codepoint % 256
|
||||
|
||||
try:
|
||||
# Look up this section's replacement table in the cache
|
||||
table = Cache[section]
|
||||
except KeyError:
|
||||
try:
|
||||
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
|
||||
# Import the module corresponding to the section
|
||||
module_name = f"x{section:03x}.py"
|
||||
main = os.path.abspath(os.path.dirname(__file__))
|
||||
module_path = os.path.join(main, module_name)
|
||||
spec = importlib.util.spec_from_file_location(module_name, module_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
except ImportError:
|
||||
# No data on this character
|
||||
# If module import fails, set cache entry to None and return
|
||||
Cache[section] = None
|
||||
return None
|
||||
|
||||
Cache[section] = table = mod.data
|
||||
# Update cache with module data
|
||||
Cache[section] = table = module.data
|
||||
|
||||
# Return the ASCII representation if found, otherwise None
|
||||
if table and len(table) > position:
|
||||
return table[position]
|
||||
else:
|
||||
return None
|
||||
|
||||
def _unidecode(string: str, errors: str, replace_str:str) -> str:
|
||||
def _transliterate(string: str, errors: str, replace_str: str) -> str:
|
||||
"""Main transliteration function.
|
||||
|
||||
Args:
|
||||
string (str): The input string.
|
||||
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||
replace_str (str): The replacement string used when errors='replace'.
|
||||
|
||||
Returns:
|
||||
str: The transliterated string.
|
||||
"""
|
||||
retval = []
|
||||
|
||||
for index, char in enumerate(string):
|
||||
repl = _get_repl_str(char)
|
||||
for char in string:
|
||||
# Get the ASCII representation of the character
|
||||
ascii_char = _get_ascii_representation(char)
|
||||
|
||||
if repl is None:
|
||||
if ascii_char is None:
|
||||
# Handle errors based on the specified policy
|
||||
if errors == 'ignore':
|
||||
repl = ''
|
||||
ascii_char = ''
|
||||
elif errors == 'strict':
|
||||
raise UnidecodeError('no replacement found for character %r '
|
||||
'in position %d' % (char, index), index)
|
||||
logging.error(f'No replacement found for character {char!r}')
|
||||
raise UnidecodeError(f'no replacement found for character {char!r}')
|
||||
elif errors == 'replace':
|
||||
repl = replace_str
|
||||
ascii_char = replace_str
|
||||
elif errors == 'preserve':
|
||||
repl = char
|
||||
ascii_char = char
|
||||
else:
|
||||
raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
|
||||
logging.error(f'Invalid value for errors parameter {errors!r}')
|
||||
raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
|
||||
|
||||
retval.append(repl)
|
||||
# Append the ASCII representation to the result
|
||||
retval.append(ascii_char)
|
||||
|
||||
return ''.join(retval)
|
||||
|
||||
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a string to ASCII, with a fast path for ASCII-only input.

    The string is first test-encoded with the ASCII codec. When that
    succeeds the input is returned untouched, so ``errors`` and
    ``replace_str`` are never consulted. Only when the probe fails is the
    string handed to ``_transliterate``.

    Args:
        string (str): The text to convert.
        errors (str): Error policy forwarded to ``_transliterate``. Can be 'ignore', 'strict', 'replace', or 'preserve'.
        replace_str (str): The replacement string forwarded to ``_transliterate`` for use when errors='replace'.

    Returns:
        str: ``string`` itself if it is pure ASCII, otherwise the result of
        ``_transliterate(string, errors, replace_str)``.
    """
    try:
        # Probe only: the encoded bytes are discarded.
        string.encode('ASCII')
        needs_transliteration = False
    except UnicodeEncodeError:
        needs_transliteration = True

    # The fallback runs outside the ``except`` block so that any error it
    # raises is not chained to the probe's UnicodeEncodeError.
    if needs_transliteration:
        return _transliterate(string, errors, replace_str)

    return string
|
||||
|
||||
transliterate = transliterate_expect_ascii
|
||||
|
@ -42,95 +42,95 @@ data = (
|
||||
'', # 0x9d
|
||||
'', # 0x9e
|
||||
'', # 0x9f
|
||||
' ', # 0xa0
|
||||
'!', # 0xa1
|
||||
'C/', # 0xa2
|
||||
' ', # 0xa0
|
||||
'!', # 0xa1
|
||||
'C/', # 0xa2
|
||||
|
||||
# Not "GBP" - Pound Sign is used for more than just British Pounds.
|
||||
'PS', # 0xa3
|
||||
|
||||
'$?', # 0xa4
|
||||
'Y=', # 0xa5
|
||||
'|', # 0xa6
|
||||
'|', # 0xa6
|
||||
'SS', # 0xa7
|
||||
'"', # 0xa8
|
||||
'(c)', # 0xa9
|
||||
'a', # 0xaa
|
||||
'"', # 0xa8
|
||||
'(c)', # 0xa9
|
||||
'a', # 0xaa
|
||||
'<<', # 0xab
|
||||
'!', # 0xac
|
||||
'', # 0xad
|
||||
'(r)', # 0xae
|
||||
'-', # 0xaf
|
||||
'deg', # 0xb0
|
||||
'!', # 0xac
|
||||
'', # 0xad
|
||||
'(r)', # 0xae
|
||||
'-', # 0xaf
|
||||
'deg', # 0xb0
|
||||
'+-', # 0xb1
|
||||
|
||||
# These might be combined with other superscript digits (u+2070 - u+2079)
|
||||
'2', # 0xb2
|
||||
'3', # 0xb3
|
||||
'2', # 0xb2
|
||||
'3', # 0xb3
|
||||
|
||||
'\'', # 0xb4
|
||||
'u', # 0xb5
|
||||
'P', # 0xb6
|
||||
'*', # 0xb7
|
||||
',', # 0xb8
|
||||
'1', # 0xb9
|
||||
'o', # 0xba
|
||||
'u', # 0xb5
|
||||
'P', # 0xb6
|
||||
'*', # 0xb7
|
||||
',', # 0xb8
|
||||
'1', # 0xb9
|
||||
'o', # 0xba
|
||||
'>>', # 0xbb
|
||||
' 1/4', # 0xbc
|
||||
' 1/2', # 0xbd
|
||||
' 3/4', # 0xbe
|
||||
'?', # 0xbf
|
||||
'A', # 0xc0
|
||||
'A', # 0xc1
|
||||
'A', # 0xc2
|
||||
'A', # 0xc3
|
||||
' 1/4', # 0xbc
|
||||
' 1/2', # 0xbd
|
||||
' 3/4', # 0xbe
|
||||
'?', # 0xbf
|
||||
'A', # 0xc0
|
||||
'A', # 0xc1
|
||||
'A', # 0xc2
|
||||
'A', # 0xc3
|
||||
|
||||
# Not "AE" - used in languages other than German
|
||||
'A', # 0xc4
|
||||
|
||||
'A', # 0xc5
|
||||
'A', # 0xc5
|
||||
'AE', # 0xc6
|
||||
'C', # 0xc7
|
||||
'E', # 0xc8
|
||||
'E', # 0xc9
|
||||
'E', # 0xca
|
||||
'E', # 0xcb
|
||||
'I', # 0xcc
|
||||
'I', # 0xcd
|
||||
'I', # 0xce
|
||||
'I', # 0xcf
|
||||
'D', # 0xd0
|
||||
'N', # 0xd1
|
||||
'O', # 0xd2
|
||||
'O', # 0xd3
|
||||
'O', # 0xd4
|
||||
'O', # 0xd5
|
||||
'C', # 0xc7
|
||||
'E', # 0xc8
|
||||
'E', # 0xc9
|
||||
'E', # 0xca
|
||||
'E', # 0xcb
|
||||
'I', # 0xcc
|
||||
'I', # 0xcd
|
||||
'I', # 0xce
|
||||
'I', # 0xcf
|
||||
'D', # 0xd0
|
||||
'N', # 0xd1
|
||||
'O', # 0xd2
|
||||
'O', # 0xd3
|
||||
'O', # 0xd4
|
||||
'O', # 0xd5
|
||||
|
||||
# Not "OE" - used in languages other than German
|
||||
'O', # 0xd6
|
||||
'O', # 0xd6
|
||||
|
||||
'x', # 0xd7
|
||||
'O', # 0xd8
|
||||
'U', # 0xd9
|
||||
'U', # 0xda
|
||||
'U', # 0xdb
|
||||
'x', # 0xd7
|
||||
'O', # 0xd8
|
||||
'U', # 0xd9
|
||||
'U', # 0xda
|
||||
'U', # 0xdb
|
||||
|
||||
# Not "UE" - used in languages other than German
|
||||
'U', # 0xdc
|
||||
'U', # 0xdc
|
||||
|
||||
'Y', # 0xdd
|
||||
'Y', # 0xdd
|
||||
'Th', # 0xde
|
||||
'ss', # 0xdf
|
||||
'a', # 0xe0
|
||||
'a', # 0xe1
|
||||
'a', # 0xe2
|
||||
'a', # 0xe3
|
||||
'a', # 0xe0
|
||||
'a', # 0xe1
|
||||
'a', # 0xe2
|
||||
'a', # 0xe3
|
||||
|
||||
# Not "ae" - used in languages other than German
|
||||
'a', # 0xe4
|
||||
|
||||
'a', # 0xe5
|
||||
'ae', # 0xe6
|
||||
'ae', # 0xe6
|
||||
'c', # 0xe7
|
||||
'e', # 0xe8
|
||||
'e', # 0xe9
|
||||
@ -160,6 +160,6 @@ data = (
|
||||
'u', # 0xfc
|
||||
|
||||
'y', # 0xfd
|
||||
'th', # 0xfe
|
||||
'th', # 0xfe
|
||||
'y', # 0xff
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user