mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-05 19:15:34 +00:00
133 lines
3.6 KiB
Python
133 lines
3.6 KiB
Python
import re
|
|
import backoff
|
|
import logging
|
|
from copy import copy
|
|
|
|
# Configure root logging at WARNING level as soon as this module is imported.
logging.basicConfig(level=logging.WARNING)
|
|
# Module-level logger; used by the retry callbacks on translate_with_backoff.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class EPUBBookLoaderHelper:
    """Translate paragraphs and insert the translations into the document.

    Each translation is written as a copy of the source paragraph placed
    directly after it; optionally the source paragraph is removed afterwards.
    """

    def __init__(
        self, translate_model, accumulated_num, translation_style, context_flag
    ):
        """Store the translation backend and insertion options.

        Args:
            translate_model: Object providing ``translate(text, context_flag)``
                and ``translate_list(paragraphs)``.
            accumulated_num: Stored as-is; no method of this class reads it.
            translation_style: Value assigned to the ``style`` attribute of
                every inserted paragraph (skipped when it is an empty string).
            context_flag: Forwarded to ``translate_model.translate``.
        """
        self.translate_model = translate_model
        self.accumulated_num = accumulated_num
        self.translation_style = translation_style
        self.context_flag = context_flag

    def insert_trans(self, p, text, translation_style="", single_translate=False):
        """Insert *text* as a new paragraph right after *p*.

        Args:
            p: Paragraph node; assumed to be a BeautifulSoup-like tag offering
                ``.string``, ``insert_after`` and ``extract`` - TODO confirm.
            text: Translated text; ``None`` is treated as an empty string.
            translation_style: ``style`` attribute for the new paragraph; an
                empty string leaves the copied attributes untouched.
            single_translate: When true, *p* is removed after the insertion.
        """
        if text is None:
            text = ""
        # Nothing to add when the translation equals the source text once
        # spaces are ignored.
        if (
            p.string is not None
            and p.string.replace(" ", "").strip() == text.replace(" ", "").strip()
        ):
            return
        new_p = copy(p)
        new_p.string = text
        if translation_style != "":
            new_p["style"] = translation_style
        p.insert_after(new_p)
        if single_translate:
            p.extract()

    # No max_tries / max_time is passed, so the retry budget is whatever the
    # backoff library defaults to - presumably unbounded; confirm in its docs.
    @backoff.on_exception(
        backoff.expo,
        Exception,
        on_backoff=lambda details: logger.warning(f"retry backoff: {details}"),
        on_giveup=lambda details: logger.warning(f"retry abort: {details}"),
        jitter=None,
    )
    def translate_with_backoff(self, text, context_flag=False):
        """Translate *text* with the model, retrying on any ``Exception``."""
        return self.translate_model.translate(text, context_flag)

    @staticmethod
    def _shorten_links(text):
        """Collapse long URLs in *text*; ``None`` is passed through untouched.

        ``insert_trans`` accepts ``None`` as "no translation", so the value
        must survive this step instead of being handed to the regex helper.
        """
        if text is None:
            return None
        return shorter_result_link(text)

    def deal_new(self, p, wait_p_list, single_translate=False):
        """Flush the pending paragraphs, then translate and insert *p*.

        Args:
            p: Paragraph node whose ``.text`` is translated.
            wait_p_list: Pending paragraphs handled first via ``deal_old``;
                emptied by that call.
            single_translate: Forwarded to ``insert_trans``.
        """
        self.deal_old(wait_p_list, single_translate, self.context_flag)
        translated = self.translate_with_backoff(p.text, self.context_flag)
        self.insert_trans(
            p,
            self._shorten_links(translated),
            self.translation_style,
            single_translate,
        )

    def deal_old(self, wait_p_list, single_translate=False, context_flag=False):
        """Translate all pending paragraphs in one batch and insert the results.

        Args:
            wait_p_list: Pending paragraph nodes; cleared in place afterwards.
            single_translate: Forwarded to ``insert_trans``.
            context_flag: Accepted for interface compatibility; not used here.
        """
        if not wait_p_list:
            return

        result_txt_list = self.translate_model.translate_list(wait_p_list)

        # zip stops at the shorter sequence, so paragraphs without a matching
        # result are left without a translation, exactly as before.
        for p, translated in zip(wait_p_list, result_txt_list):
            self.insert_trans(
                p,
                self._shorten_links(translated),
                self.translation_style,
                single_translate,
            )

        wait_p_list.clear()
|
|
|
|
|
|
# Regex source for URL-like text: one or more "http://", "https://" or "www."
# prefixes followed by at least one allowed character or %XX escape. Used by
# is_text_link, is_text_tail_link and shorter_result_link.
# NOTE(review): inside `[$-_@.&+]` the `$-_` part is a character range
# (U+0024..U+005F), so it also admits "<", ">", ";", "=" etc. - confirm intended.
url_pattern = r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
|
|
|
|
|
|
def is_text_link(text):
    """Return True when *text*, ignoring surrounding whitespace, starts with a URL."""
    candidate = text.strip()
    return re.match(url_pattern, candidate) is not None
|
|
|
|
|
|
def is_text_tail_link(text, num=80):
    """Return True when the stripped *text* is shorter than *num* and ends in a URL."""
    stripped = text.strip()
    if len(stripped) >= num:
        return False
    tail_pattern = r".*" + url_pattern + r"$"
    return re.match(tail_pattern, stripped) is not None
|
|
|
|
|
|
def shorter_result_link(text, num=20):
    """Replace URLs in *text* with ``"..."`` when the first URL is long.

    Args:
        text: Text to post-process. ``None`` or an empty string is returned
            unchanged, so a missing translation does not raise here.
        num: Minimum length of the first URL match that triggers replacement.

    Returns:
        *text* unchanged when it holds no URL or its first URL is shorter
        than *num*; otherwise *text* with every URL match replaced by "...".
    """
    # Callers may hand over None for a failed translation; there is nothing
    # to shorten in that case (or in an empty string).
    if not text:
        return text

    match = re.search(url_pattern, text)

    if match is None or len(match.group()) < num:
        return text

    return re.sub(url_pattern, "...", text)
|
|
|
|
|
|
def is_text_source(text):
    """Return True when the stripped *text* begins with the "Source: " marker."""
    stripped = text.strip()
    marker = "Source: "
    return stripped.startswith(marker)
|
|
|
|
|
|
def is_text_list(text, num=80):
    """Return True when *text* is a short caption starting with "Listing <digits>".

    Args:
        text: Text to inspect; surrounding whitespace is ignored.
        num: The stripped text must be shorter than this many characters.

    Returns:
        A strict ``bool``, matching the other ``is_text_*`` predicates.
    """
    text = text.strip()
    return len(text) < num and re.match(r"^Listing\s*\d+", text) is not None
|
|
|
|
|
|
def is_text_figure(text, num=80):
    """Return True when *text* is a short caption starting with "Figure <digits>".

    Args:
        text: Text to inspect; surrounding whitespace is ignored.
        num: The stripped text must be shorter than this many characters.

    Returns:
        A strict ``bool``, matching the other ``is_text_*`` predicates.
    """
    text = text.strip()
    return len(text) < num and re.match(r"^Figure\s*\d+", text) is not None
|
|
|
|
|
|
def is_text_digit_and_space(s):
    """Return True when every character of *s* is a digit or whitespace.

    An empty string has no offending character and therefore yields True.
    """
    return all(ch.isdigit() or ch.isspace() for ch in s)
|
|
|
|
|
|
def is_text_isbn(s):
    """Return True when *s* is "ISBN"/"eISBN"/"EISBN" followed only by digits and spaces."""
    isbn_match = re.match(r"^[Ee]?ISBN\s*\d[\d\s]*$", s)
    return isbn_match is not None
|
|
|
|
|
|
def not_trans(s):
    """Return True when *s* matches at least one of the is_text_* predicates."""
    skip_checks = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    return any(check(s) for check in skip_checks)
|