mirror of
https://github.com/Arrowar/StreamingCommunity.git
synced 2025-06-07 20:15:24 +00:00
Fix unidecode
This commit is contained in:
parent
9d1b7536b7
commit
3a99ca2afe
@ -5,7 +5,7 @@ from Src.Util.console import console, msg
|
|||||||
from Src.Util.config import config_manager
|
from Src.Util.config import config_manager
|
||||||
from Src.Util.table import TVShowManager
|
from Src.Util.table import TVShowManager
|
||||||
from Src.Util.message import start_message
|
from Src.Util.message import start_message
|
||||||
from Src.Lib.Unidecode import unidecode
|
from Src.Lib.Unidecode import transliterate
|
||||||
from Src.Lib.FFmpeg.my_m3u8 import Downloader
|
from Src.Lib.FFmpeg.my_m3u8 import Downloader
|
||||||
from .Class import VideoSource
|
from .Class import VideoSource
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
|
|||||||
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
|
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
|
||||||
|
|
||||||
# Define filename and path for the downloaded video
|
# Define filename and path for the downloaded video
|
||||||
mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
|
||||||
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
|
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
|
||||||
os.makedirs(mp4_path, exist_ok=True)
|
os.makedirs(mp4_path, exist_ok=True)
|
||||||
|
|
||||||
|
@ -1,138 +1,143 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# 04.04.24
|
||||||
# vi:tabstop=4:expandtab:sw=4
|
|
||||||
"""Transliterate Unicode text into plain 7-bit ASCII.
|
|
||||||
|
|
||||||
Example usage:
|
|
||||||
|
|
||||||
>>> from unidecode import unidecode
|
# Import
|
||||||
>>> unidecode("\u5317\u4EB0")
|
import os
|
||||||
"Bei Jing "
|
import logging
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
The transliteration uses a straightforward map, and doesn't have alternatives
|
# Variable
|
||||||
for the same character based on language, position, or anything else.
|
Cache = {}
|
||||||
|
|
||||||
A standard string object will be returned. If you need bytes, use:
|
|
||||||
|
|
||||||
>>> unidecode("Κνωσός").encode("ascii")
|
|
||||||
b'Knosos'
|
|
||||||
"""
|
|
||||||
import warnings
|
|
||||||
from typing import Dict, Optional, Sequence
|
|
||||||
|
|
||||||
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
|
|
||||||
|
|
||||||
class UnidecodeError(ValueError):
|
class UnidecodeError(ValueError):
|
||||||
def __init__(self, message: str, index: Optional[int] = None) -> None:
|
|
||||||
"""Raised for Unidecode-related errors.
|
|
||||||
|
|
||||||
The index attribute contains the index of the character that caused
|
|
||||||
the error.
|
|
||||||
"""
|
|
||||||
super(UnidecodeError, self).__init__(message)
|
|
||||||
self.index = index
|
|
||||||
|
|
||||||
|
|
||||||
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
|
||||||
"""Transliterate an Unicode object into an ASCII string
|
|
||||||
|
|
||||||
>>> unidecode("\u5317\u4EB0")
|
|
||||||
"Bei Jing "
|
|
||||||
|
|
||||||
This function first tries to convert the string using ASCII codec.
|
|
||||||
If it fails (because of non-ASCII characters), it falls back to
|
|
||||||
transliteration using the character tables.
|
|
||||||
|
|
||||||
This is approx. five times faster if the string only contains ASCII
|
|
||||||
characters, but slightly slower than unidecode_expect_nonascii if
|
|
||||||
non-ASCII characters are present.
|
|
||||||
|
|
||||||
errors specifies what to do with characters that have not been
|
|
||||||
found in replacement tables. The default is 'ignore' which ignores
|
|
||||||
the character. 'strict' raises an UnidecodeError. 'replace'
|
|
||||||
substitutes the character with replace_str (default is '?').
|
|
||||||
'preserve' keeps the original character.
|
|
||||||
|
|
||||||
Note that if 'preserve' is used the returned string might not be
|
|
||||||
ASCII!
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
bytestring = string.encode('ASCII')
|
|
||||||
except UnicodeEncodeError:
|
|
||||||
pass
|
pass
|
||||||
else:
|
|
||||||
return string
|
|
||||||
|
|
||||||
return _unidecode(string, errors, replace_str)
|
def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||||
|
"""Transliterates non-ASCII characters in a string to their ASCII counterparts.
|
||||||
|
|
||||||
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
Args:
|
||||||
"""Transliterate an Unicode object into an ASCII string
|
string (str): The input string containing non-ASCII characters.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
>>> unidecode("\u5317\u4EB0")
|
Returns:
|
||||||
"Bei Jing "
|
str: The transliterated string with non-ASCII characters replaced.
|
||||||
|
|
||||||
See unidecode_expect_ascii.
|
|
||||||
"""
|
"""
|
||||||
|
return _transliterate(string, errors, replace_str)
|
||||||
|
|
||||||
return _unidecode(string, errors, replace_str)
|
def _get_ascii_representation(char: str) -> str:
|
||||||
|
"""Obtains the ASCII representation of a Unicode character.
|
||||||
|
|
||||||
unidecode = unidecode_expect_ascii
|
Args:
|
||||||
|
char (str): The Unicode character.
|
||||||
|
|
||||||
def _get_repl_str(char: str) -> Optional[str]:
|
Returns:
|
||||||
|
str: The ASCII representation of the character, or None if no replacement is known.
|
||||||
|
"""
|
||||||
codepoint = ord(char)
|
codepoint = ord(char)
|
||||||
|
|
||||||
|
# If the character is ASCII, return it as is
|
||||||
if codepoint < 0x80:
|
if codepoint < 0x80:
|
||||||
# Already ASCII
|
|
||||||
return str(char)
|
return str(char)
|
||||||
|
|
||||||
|
# No data for code points above U+EFFFF (Private Use planes 15-16 and beyond)
|
||||||
if codepoint > 0xeffff:
|
if codepoint > 0xeffff:
|
||||||
# No data on characters in Private Use Area and above.
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Warn about surrogate characters
|
||||||
if 0xd800 <= codepoint <= 0xdfff:
|
if 0xd800 <= codepoint <= 0xdfff:
|
||||||
warnings.warn( "Surrogate character %r will be ignored. "
|
logging.warning("Surrogate character %r will be ignored. "
|
||||||
"You might be using a narrow Python build." % (char,),
|
"You might be using a narrow Python build.", char)
|
||||||
RuntimeWarning, 2)
|
|
||||||
|
|
||||||
section = codepoint >> 8 # Chop off the last two hex digits
|
# Calculate section and position
|
||||||
position = codepoint % 256 # Last two hex digits
|
section = codepoint >> 8
|
||||||
|
position = codepoint % 256
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Look up the character in the cache
|
||||||
table = Cache[section]
|
table = Cache[section]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
try:
|
try:
|
||||||
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
|
# Import the module corresponding to the section
|
||||||
|
module_name = f"x{section:03x}.py"
|
||||||
|
main = os.path.abspath(os.path.dirname(__file__))
|
||||||
|
module_path = os.path.join(main, module_name)
|
||||||
|
spec = importlib.util.spec_from_file_location(module_name, module_path)
|
||||||
|
module = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(module)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# No data on this character
|
# If module import fails, set cache entry to None and return
|
||||||
Cache[section] = None
|
Cache[section] = None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
Cache[section] = table = mod.data
|
# Update cache with module data
|
||||||
|
Cache[section] = table = module.data
|
||||||
|
|
||||||
|
# Return the ASCII representation if found, otherwise None
|
||||||
if table and len(table) > position:
|
if table and len(table) > position:
|
||||||
return table[position]
|
return table[position]
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _unidecode(string: str, errors: str, replace_str:str) -> str:
|
def _transliterate(string: str, errors: str, replace_str: str) -> str:
|
||||||
|
"""Main transliteration function.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (str): The input string.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The transliterated string.
|
||||||
|
"""
|
||||||
retval = []
|
retval = []
|
||||||
|
|
||||||
for index, char in enumerate(string):
|
for char in string:
|
||||||
repl = _get_repl_str(char)
|
# Get the ASCII representation of the character
|
||||||
|
ascii_char = _get_ascii_representation(char)
|
||||||
|
|
||||||
if repl is None:
|
if ascii_char is None:
|
||||||
|
# Handle errors based on the specified policy
|
||||||
if errors == 'ignore':
|
if errors == 'ignore':
|
||||||
repl = ''
|
ascii_char = ''
|
||||||
elif errors == 'strict':
|
elif errors == 'strict':
|
||||||
raise UnidecodeError('no replacement found for character %r '
|
logging.error(f'No replacement found for character {char!r}')
|
||||||
'in position %d' % (char, index), index)
|
raise UnidecodeError(f'no replacement found for character {char!r}')
|
||||||
elif errors == 'replace':
|
elif errors == 'replace':
|
||||||
repl = replace_str
|
ascii_char = replace_str
|
||||||
elif errors == 'preserve':
|
elif errors == 'preserve':
|
||||||
repl = char
|
ascii_char = char
|
||||||
else:
|
else:
|
||||||
raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
|
logging.error(f'Invalid value for errors parameter {errors!r}')
|
||||||
|
raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
|
||||||
|
|
||||||
retval.append(repl)
|
# Append the ASCII representation to the result
|
||||||
|
retval.append(ascii_char)
|
||||||
|
|
||||||
return ''.join(retval)
|
return ''.join(retval)
|
||||||
|
|
||||||
|
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
||||||
|
"""Transliterates non-ASCII characters in a string, expecting ASCII input.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
string (str): The input string containing non-ASCII characters.
|
||||||
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
||||||
|
replace_str (str): The replacement string used when errors='replace'.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The transliterated string with non-ASCII characters replaced.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Check if the string can be encoded as ASCII
|
||||||
|
string.encode('ASCII')
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
# If encoding fails, fall back to transliteration
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# If the string is already ASCII, return it as is
|
||||||
|
return string
|
||||||
|
|
||||||
|
# Otherwise, transliterate non-ASCII characters
|
||||||
|
return _transliterate(string, errors, replace_str)
|
||||||
|
|
||||||
|
transliterate = transliterate_expect_ascii
|
||||||
|
Loading…
x
Reference in New Issue
Block a user