hleft a0c999a2e6
support single_translate in epub (#237)
* support no_bilingual

* use single_translate instead no_bilingual
2023-04-09 21:20:41 +08:00

113 lines
3.0 KiB
Python

import re
from copy import copy
class EPUBBookLoaderHelper:
def __init__(self, translate_model, accumulated_num, translation_style):
self.translate_model = translate_model
self.accumulated_num = accumulated_num
self.translation_style = translation_style
def insert_trans(self, p, text, translation_style="", single_translate=False):
if (
p.string is not None
and p.string.replace(" ", "").strip() == text.replace(" ", "").strip()
):
return
new_p = copy(p)
new_p.string = text
if translation_style != "":
new_p["style"] = translation_style
p.insert_after(new_p)
if single_translate:
p.extract()
def deal_new(self, p, wait_p_list, single_translate=False):
self.deal_old(wait_p_list, single_translate)
self.insert_trans(
p,
shorter_result_link(self.translate_model.translate(p.text)),
self.translation_style,
single_translate,
)
def deal_old(self, wait_p_list, single_translate=False):
if not wait_p_list:
return
result_txt_list = self.translate_model.translate_list(wait_p_list)
for i in range(len(wait_p_list)):
if i < len(result_txt_list):
p = wait_p_list[i]
self.insert_trans(
p,
shorter_result_link(result_txt_list[i]),
self.translation_style,
single_translate,
)
wait_p_list.clear()
url_pattern = r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
def is_text_link(text):
return bool(re.compile(url_pattern).match(text.strip()))
def is_text_tail_link(text, num=80):
text = text.strip()
pattern = r".*" + url_pattern + r"$"
return bool(re.compile(pattern).match(text)) and len(text) < num
def shorter_result_link(text, num=20):
match = re.search(url_pattern, text)
if not match or len(match.group()) < num:
return text
return re.compile(url_pattern).sub("...", text)
def is_text_source(text):
return text.strip().startswith("Source: ")
def is_text_list(text, num=80):
text = text.strip()
return re.match(r"^Listing\s*\d+", text) and len(text) < num
def is_text_figure(text, num=80):
text = text.strip()
return re.match(r"^Figure\s*\d+", text) and len(text) < num
def is_text_digit_and_space(s):
for c in s:
if not c.isdigit() and not c.isspace():
return False
return True
def is_text_isbn(s):
pattern = r"^[Ee]?ISBN\s*\d[\d\s]*$"
return bool(re.match(pattern, s))
def not_trans(s):
return any(
[
is_text_link(s),
is_text_tail_link(s),
is_text_source(s),
is_text_list(s),
is_text_figure(s),
is_text_digit_and_space(s),
is_text_isbn(s),
]
)