fix ascii character

This commit is contained in:
Ghost 2024-05-16 18:25:00 +02:00
parent f98fbe18e6
commit 67bbb9a043
7 changed files with 16 additions and 453 deletions

View File

@ -3,6 +3,7 @@
import os
import sys
import logging
from unidecode import unidecode as transliterate
from typing import List
@ -12,7 +13,6 @@ from Src.Util.config import config_manager
from Src.Util.table import TVShowManager
from Src.Util.message import start_message
from Src.Util.os import remove_special_characters
from Src.Lib.Unidecode import transliterate
from Src.Util.file_validation import can_create_file
from Src.Lib.FFmpeg.my_m3u8 import Downloader
from Src.Util.mapper import map_episode_title

View File

@ -1,151 +0,0 @@
# 04.04.24
import os
import logging
import importlib.util
# Variable
# Table cache keyed by section number (codepoint >> 8). Each value is the
# ``data`` table of that section's module, or None when the module failed
# to import.
Cache = {}
class UnidecodeError(ValueError):
    """Raised when a character has no replacement under errors='strict',
    or when the ``errors`` policy name itself is not recognised."""
def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Run the per-character transliteration over ``string`` unconditionally.

    No ASCII shortcut is taken here: every character is handed to the
    transliteration routine.

    Args:
        string (str): Text to convert.
        errors (str): Policy for characters with no known replacement:
            'ignore' drops them, 'strict' raises, 'replace' substitutes
            ``replace_str`` and 'preserve' keeps the original character.
        replace_str (str): Substitute used when ``errors='replace'``.

    Returns:
        str: The converted text.
    """
    converted = _transliterate(string, errors, replace_str)
    return converted
def _get_ascii_representation(char: str) -> str:
    """Return the ASCII replacement for a single Unicode character.

    ASCII characters are returned unchanged. Any other character is looked
    up in a per-section table (the ``data`` attribute of a sibling
    ``xNNN.py`` file, where ``NNN`` is ``codepoint >> 8`` in hex). Tables are
    loaded on first use and memoised in ``Cache``; a section whose file is
    missing or cannot be imported is memoised as ``None``.

    Args:
        char (str): A single Unicode character.

    Returns:
        str: The replacement text, or ``None`` when no replacement is known
        (code point above 0xEFFFF, section table unavailable, or table
        shorter than the character's position).
    """
    codepoint = ord(char)

    # If the character is ASCII, return it as is
    if codepoint < 0x80:
        return str(char)

    # No table lookup is attempted for code points above 0xEFFFF
    if codepoint > 0xeffff:
        return None

    # Warn about surrogate characters (the lookup below then finds no table)
    if 0xd800 <= codepoint <= 0xdfff:
        logging.warning("Surrogate character %r will be ignored. "
                        "You might be using a narrow Python build.", char)

    # High bits select the table file, low byte selects the entry
    section = codepoint >> 8
    position = codepoint % 256

    try:
        # Look up the section table in the cache
        table = Cache[section]
    except KeyError:
        try:
            # Load the module corresponding to the section from this directory
            module_name = f"x{section:03x}.py"
            main = os.path.abspath(os.path.dirname(__file__))
            module_path = os.path.join(main, module_name)
            spec = importlib.util.spec_from_file_location(module_name, module_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
        except (ImportError, OSError):
            # A missing table file surfaces as FileNotFoundError (an OSError),
            # not ImportError; treat both as "no data for this section".
            Cache[section] = None
            return None

        # Update cache with module data
        Cache[section] = table = module.data

    # Return the ASCII representation if found, otherwise None
    if table and len(table) > position:
        return table[position]
    return None
def _transliterate(string: str, errors: str, replace_str: str) -> str:
    """Build the transliterated form of ``string`` one character at a time.

    Args:
        string (str): Text to convert.
        errors (str): What to do with a character that has no replacement:
            'ignore' drops it, 'strict' raises ``UnidecodeError``,
            'replace' emits ``replace_str`` and 'preserve' keeps the
            character unchanged. Any other value raises ``UnidecodeError``
            the first time an unmapped character is met.
        replace_str (str): Substitute used when ``errors='replace'``.

    Returns:
        str: The concatenation of all per-character replacements.
    """
    pieces = []

    for ch in string:
        mapped = _get_ascii_representation(ch)

        # Common case: a replacement exists
        if mapped is not None:
            pieces.append(mapped)
            continue

        # No replacement known: apply the requested policy
        if errors == 'ignore':
            mapped = ''
        elif errors == 'strict':
            logging.error(f'No replacement found for character {ch!r}')
            raise UnidecodeError(f'no replacement found for character {ch!r}')
        elif errors == 'replace':
            mapped = replace_str
        elif errors == 'preserve':
            mapped = ch
        else:
            logging.error(f'Invalid value for errors parameter {errors!r}')
            raise UnidecodeError(f'invalid value for errors parameter {errors!r}')

        pieces.append(mapped)

    return ''.join(pieces)
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate ``string``, short-circuiting when it is already ASCII.

    The string is first test-encoded as ASCII; if that succeeds the very
    same string is returned, otherwise the per-character transliteration
    is run.

    Args:
        string (str): Text to convert.
        errors (str): Policy for characters with no known replacement
            ('ignore', 'strict', 'replace' or 'preserve').
        replace_str (str): Substitute used when ``errors='replace'``.

    Returns:
        str: ``string`` itself when it is pure ASCII, else its transliteration.
    """
    try:
        # Cheap probe: succeeds only for pure-ASCII text
        string.encode('ASCII')
    except UnicodeEncodeError:
        # At least one non-ASCII character: take the slow path
        return _transliterate(string, errors, replace_str)

    # Already ASCII, nothing to do
    return string


# Public entry point exported by this module
transliterate = transliterate_expect_ascii

View File

@ -1,258 +0,0 @@
# Transliteration table for code points 0x00-0xff, indexed by the low byte
# of the code point. Entries 0x00-0x7e are the identity mapping.
data = (
'\x00', # 0x00
'\x01', # 0x01
'\x02', # 0x02
'\x03', # 0x03
'\x04', # 0x04
'\x05', # 0x05
'\x06', # 0x06
'\x07', # 0x07
'\x08', # 0x08
'\x09', # 0x09
'\x0a', # 0x0a
'\x0b', # 0x0b
'\x0c', # 0x0c
'\x0d', # 0x0d
'\x0e', # 0x0e
'\x0f', # 0x0f
'\x10', # 0x10
'\x11', # 0x11
'\x12', # 0x12
'\x13', # 0x13
'\x14', # 0x14
'\x15', # 0x15
'\x16', # 0x16
'\x17', # 0x17
'\x18', # 0x18
'\x19', # 0x19
'\x1a', # 0x1a
'\x1b', # 0x1b
'\x1c', # 0x1c
'\x1d', # 0x1d
'\x1e', # 0x1e
'\x1f', # 0x1f
' ', # 0x20
'!', # 0x21
'"', # 0x22
'#', # 0x23
'$', # 0x24
'%', # 0x25
'&', # 0x26
'\'', # 0x27
'(', # 0x28
')', # 0x29
'*', # 0x2a
'+', # 0x2b
',', # 0x2c
'-', # 0x2d
'.', # 0x2e
'/', # 0x2f
'0', # 0x30
'1', # 0x31
'2', # 0x32
'3', # 0x33
'4', # 0x34
'5', # 0x35
'6', # 0x36
'7', # 0x37
'8', # 0x38
'9', # 0x39
':', # 0x3a
';', # 0x3b
'<', # 0x3c
'=', # 0x3d
'>', # 0x3e
'?', # 0x3f
'@', # 0x40
'A', # 0x41
'B', # 0x42
'C', # 0x43
'D', # 0x44
'E', # 0x45
'F', # 0x46
'G', # 0x47
'H', # 0x48
'I', # 0x49
'J', # 0x4a
'K', # 0x4b
'L', # 0x4c
'M', # 0x4d
'N', # 0x4e
'O', # 0x4f
'P', # 0x50
'Q', # 0x51
'R', # 0x52
'S', # 0x53
'T', # 0x54
'U', # 0x55
'V', # 0x56
'W', # 0x57
'X', # 0x58
'Y', # 0x59
'Z', # 0x5a
'[', # 0x5b
'\\', # 0x5c
']', # 0x5d
'^', # 0x5e
'_', # 0x5f
'`', # 0x60
'a', # 0x61
'b', # 0x62
'c', # 0x63
'd', # 0x64
'e', # 0x65
'f', # 0x66
'g', # 0x67
'h', # 0x68
'i', # 0x69
'j', # 0x6a
'k', # 0x6b
'l', # 0x6c
'm', # 0x6d
'n', # 0x6e
'o', # 0x6f
'p', # 0x70
'q', # 0x71
'r', # 0x72
's', # 0x73
't', # 0x74
'u', # 0x75
'v', # 0x76
'w', # 0x77
'x', # 0x78
'y', # 0x79
'z', # 0x7a
'{', # 0x7b
'|', # 0x7c
'}', # 0x7d
'~', # 0x7e
'', # 0x7f
'', # 0x80
'', # 0x81
'', # 0x82
'', # 0x83
'', # 0x84
'', # 0x85
'', # 0x86
'', # 0x87
'', # 0x88
'', # 0x89
'', # 0x8a
'', # 0x8b
'', # 0x8c
'', # 0x8d
'', # 0x8e
'', # 0x8f
'', # 0x90
'', # 0x91
'', # 0x92
'', # 0x93
'', # 0x94
'', # 0x95
'', # 0x96
'', # 0x97
'', # 0x98
'', # 0x99
'', # 0x9a
'', # 0x9b
'', # 0x9c
'', # 0x9d
'', # 0x9e
'', # 0x9f
' ', # 0xa0
'!', # 0xa1
'C/', # 0xa2
'PS', # 0xa3
'$?', # 0xa4
'Y=', # 0xa5
'|', # 0xa6
'SS', # 0xa7
'"', # 0xa8
'(c)', # 0xa9
'a', # 0xaa
'<<', # 0xab
'!', # 0xac
'', # 0xad
'(r)', # 0xae
'-', # 0xaf
'deg', # 0xb0
'+-', # 0xb1
'2', # 0xb2
'3', # 0xb3
'\'', # 0xb4
'u', # 0xb5
'P', # 0xb6
'*', # 0xb7
',', # 0xb8
'1', # 0xb9
'o', # 0xba
'>>', # 0xbb
'1/4', # 0xbc
'1/2', # 0xbd
'3/4', # 0xbe
'?', # 0xbf
'A', # 0xc0
'A', # 0xc1
'A', # 0xc2
'A', # 0xc3
'A', # 0xc4
'A', # 0xc5
'AE', # 0xc6
'C', # 0xc7
'E', # 0xc8
'E', # 0xc9
'E', # 0xca
'E', # 0xcb
'I', # 0xcc
'I', # 0xcd
'I', # 0xce
'I', # 0xcf
'D', # 0xd0
'N', # 0xd1
'O', # 0xd2
'O', # 0xd3
'O', # 0xd4
'O', # 0xd5
'O', # 0xd6
'x', # 0xd7
'O', # 0xd8
'U', # 0xd9
'U', # 0xda
'U', # 0xdb
'U', # 0xdc
'Y', # 0xdd
'Th', # 0xde
'ss', # 0xdf
'a', # 0xe0
'a', # 0xe1
'a', # 0xe2
'a', # 0xe3
'a', # 0xe4
'a', # 0xe5
'ae', # 0xe6
'c', # 0xe7
'e', # 0xe8
'e', # 0xe9
'e', # 0xea
'e', # 0xeb
'i', # 0xec
'i', # 0xed
'i', # 0xee
'i', # 0xef
'd', # 0xf0
'n', # 0xf1
'o', # 0xf2
'o', # 0xf3
'o', # 0xf4
'o', # 0xf5
'o', # 0xf6
'/', # 0xf7
'o', # 0xf8
'u', # 0xf9
'u', # 0xfa
'u', # 0xfb
'u', # 0xfc
'y', # 0xfd
'th', # 0xfe
'y', # 0xff
)

View File

@ -1,42 +0,0 @@
# Partial transliteration table (entries 0x00-0x27), indexed by the low byte
# of the code point.
# NOTE(review): contents look like the General Punctuation section (U+20xx);
# confirm against the file name, which is not visible here.
data = (
    # 0x00 - 0x0b
    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
    # 0x0c - 0x0f
    '', '', '', '',
    # 0x10 - 0x17
    '-', '-', '-', '-', '--', '--', '||', '_',
    # 0x18 - 0x1f
    '\'', '\'', ',', '\'', '"', '"', ',,', '"',
    # 0x20 - 0x27
    '+', '++', '*', '*>', '.', '..', '...', '.',
)

View File

@ -1,8 +1,9 @@
# 10.04.24
from unidecode import unidecode as transliterate
# Internal utilities
from Src.Lib.Unidecode import transliterate
from Src.Util.config import config_manager
from Src.Api.Class.EpisodeType import Episode

13
file_list.txt Normal file
View File

@ -0,0 +1,13 @@
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\0.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\1.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\2.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\3.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\4.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\5.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\6.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\7.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\8.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\9.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\10.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\11.ts'
file 'Video\Series\pechino-express\S2\pechino-express_S02E01_Ha Noi - Vinh\tmp\video\12.ts'

Binary file not shown.