diff --git a/book_maker/loader/epub_loader.py b/book_maker/loader/epub_loader.py index c4b9a1c..1d910fa 100644 --- a/book_maker/loader/epub_loader.py +++ b/book_maker/loader/epub_loader.py @@ -14,14 +14,7 @@ from tqdm import tqdm from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs from .base_loader import BaseBookLoader -from .helper import ( - EPUBBookLoaderHelper, - is_text_figure, - is_text_link, - is_text_list, - is_text_source, - is_text_tail_link, -) +from .helper import EPUBBookLoaderHelper, not_trans, is_text_link class EPUBBookLoader(BaseBookLoader): @@ -144,14 +137,7 @@ class EPUBBookLoader(BaseBookLoader): for sup in temp_p.find_all("sup"): sup.extract() if any( - [ - not p.text, - self._is_special_text(temp_p.text), - is_text_source(temp_p.text), - is_text_list(temp_p.text), - is_text_figure(temp_p.text), - is_text_tail_link(temp_p.text), - ] + [not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)] ): if i == len(p_list) - 1: self.helper.deal_old(wait_p_list) diff --git a/book_maker/loader/helper.py b/book_maker/loader/helper.py index 52d1d87..34de65c 100644 --- a/book_maker/loader/helper.py +++ b/book_maker/loader/helper.py @@ -31,7 +31,7 @@ class EPUBBookLoaderHelper: def is_text_link(text): url_pattern = re.compile( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" + r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" ) return bool(url_pattern.match(text.strip())) @@ -56,3 +56,29 @@ def is_text_list(text, num=80): def is_text_figure(text, num=80): text = text.strip() return re.match(r"^Figure\s*\d+", text) and len(text) < num + + +def is_text_digit_and_space(s): + for c in s: + if not c.isdigit() and not c.isspace(): + return False + return True + + +def is_text_isbn(s): + pattern = r"^[Ee]?ISBN\s*\d[\d\s]*$" + return bool(re.match(pattern, s)) + + +def not_trans(s): + return any( + [ + is_text_link(s), + is_text_tail_link(s), + is_text_source(s), + is_text_list(s), + is_text_figure(s), + is_text_digit_and_space(s), + is_text_isbn(s), + ] + )