Add more cases that don't require translation

This commit is contained in:
h 2023-03-18 19:25:36 +08:00
parent 99bcaddff6
commit 79b2d67cc9
2 changed files with 29 additions and 17 deletions

View File

@ -14,14 +14,7 @@ from tqdm import tqdm
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
from .base_loader import BaseBookLoader from .base_loader import BaseBookLoader
from .helper import ( from .helper import EPUBBookLoaderHelper, not_trans, is_text_link
EPUBBookLoaderHelper,
is_text_figure,
is_text_link,
is_text_list,
is_text_source,
is_text_tail_link,
)
class EPUBBookLoader(BaseBookLoader): class EPUBBookLoader(BaseBookLoader):
@ -144,14 +137,7 @@ class EPUBBookLoader(BaseBookLoader):
for sup in temp_p.find_all("sup"): for sup in temp_p.find_all("sup"):
sup.extract() sup.extract()
if any( if any(
[ [not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
not p.text,
self._is_special_text(temp_p.text),
is_text_source(temp_p.text),
is_text_list(temp_p.text),
is_text_figure(temp_p.text),
is_text_tail_link(temp_p.text),
]
): ):
if i == len(p_list) - 1: if i == len(p_list) - 1:
self.helper.deal_old(wait_p_list) self.helper.deal_old(wait_p_list)

View File

@ -31,7 +31,7 @@ class EPUBBookLoaderHelper:
def is_text_link(text): def is_text_link(text):
url_pattern = re.compile( url_pattern = re.compile(
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
) )
return bool(url_pattern.match(text.strip())) return bool(url_pattern.match(text.strip()))
@ -56,3 +56,29 @@ def is_text_list(text, num=80):
def is_text_figure(text, num=80):
    """Return True when *text* looks like a short figure caption.

    Surrounding whitespace is ignored. The text qualifies when it starts
    with ``Figure`` followed by optional whitespace and at least one digit,
    and its stripped length is below *num* characters.

    Args:
        text: The paragraph text to inspect.
        num: Exclusive upper bound on the stripped length.

    Returns:
        bool: True for a short figure caption, False otherwise.
    """
    text = text.strip()
    # Coerce the match object to bool so that a non-match yields False
    # rather than leaking None to callers.
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num
def is_text_digit_and_space(s):
    """Return True when every character of *s* is a digit or whitespace.

    An empty string contains no offending character and therefore also
    yields True.

    Args:
        s: The text to inspect.

    Returns:
        bool: False as soon as a character that is neither a digit nor
        whitespace is found, True otherwise.
    """
    return all(c.isdigit() or c.isspace() for c in s)
def is_text_isbn(s):
    """Return True when *s* consists solely of an ISBN / eISBN line.

    Accepted shape, after surrounding whitespace is removed:

    * ``ISBN``, optionally prefixed with ``E``/``e``;
    * an optional ``-10`` / ``-13`` suffix and an optional colon;
    * a number made of digits separated by whitespace or hyphens,
      optionally ending in the ``X`` check digit used by ISBN-10.

    Everything the earlier digits-and-spaces-only pattern matched is
    still matched.

    Args:
        s: The text to inspect.

    Returns:
        bool: True when the whole text is an ISBN line.
    """
    pattern = r"^[Ee]?ISBN(?:-1[03])?\s*:?\s*\d[\d\s-]*[Xx]?$"
    # Strip first so leading whitespace does not defeat the anchored match.
    return bool(re.match(pattern, s.strip()))
def not_trans(s):
    """Return True when *s* is flagged by any "do not translate" check.

    The checks are tried in a fixed order: link, tail link, source, list,
    figure caption, digits-and-whitespace only, and ISBN line. Each check
    is a module-level predicate that receives *s* unchanged.

    Args:
        s: The paragraph text to inspect.

    Returns:
        bool: True if at least one predicate returns a truthy value.
    """
    checks = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    # A generator lets any() stop at the first hit instead of running
    # every predicate up front.
    return any(check(s) for check in checks)