Add more cases that don't require translation

2025-07-19 08:30:16 +00:00 · 2023-03-18 19:25:36 +08:00 · 2023-03-18 19:25:36 +08:00 · 79b2d67cc9
commit 79b2d67cc9
parent 99bcaddff6
2 changed files with 29 additions and 17 deletions
--- a/book_maker/loader/epub_loader.py
+++ b/book_maker/loader/epub_loader.py
@ -14,14 +14,7 @@ from tqdm import tqdm
 from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
 from .base_loader import BaseBookLoader
-from .helper import (
-    EPUBBookLoaderHelper,
-    is_text_figure,
-    is_text_link,
-    is_text_list,
-    is_text_source,
-    is_text_tail_link,
-)
+from .helper import EPUBBookLoaderHelper, not_trans, is_text_link
 class EPUBBookLoader(BaseBookLoader):
@ -144,14 +137,7 @@ class EPUBBookLoader(BaseBookLoader):
            for sup in temp_p.find_all("sup"):
                sup.extract()
            if any(
-                [
-                    not p.text,
-                    self._is_special_text(temp_p.text),
-                    is_text_source(temp_p.text),
-                    is_text_list(temp_p.text),
-                    is_text_figure(temp_p.text),
-                    is_text_tail_link(temp_p.text),
-                ]
+                [not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
            ):
                if i == len(p_list) - 1:
                    self.helper.deal_old(wait_p_list)
--- a/book_maker/loader/helper.py
+++ b/book_maker/loader/helper.py
@ -31,7 +31,7 @@ class EPUBBookLoaderHelper:
 def is_text_link(text):
    url_pattern = re.compile(
-        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+        r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.match(text.strip()))
@ -56,3 +56,29 @@ def is_text_list(text, num=80):
 def is_text_figure(text, num=80):
    """Tell whether ``text`` looks like a short figure caption.

    The stripped text must start with ``Figure``, optional whitespace and
    at least one digit, and must be shorter than ``num`` characters.

    Args:
        text: Candidate paragraph text.
        num: Exclusive upper bound on the length of the stripped text.

    Returns:
        bool: ``True`` for a short figure caption, ``False`` otherwise.
        The result is always a real boolean (never a ``re.Match`` object
        or ``None``), in line with ``is_text_link`` and ``is_text_isbn``.
    """
    text = text.strip()
    # bool() turns the Match/None result into a proper boolean before it
    # is combined with the length check.
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num
 def is_text_digit_and_space(s):
    """Return ``True`` when ``s`` holds nothing but digits and whitespace.

    An empty string contains no offending character, so it yields ``True``
    as well.
    """
    return all(ch.isdigit() or ch.isspace() for ch in s)
 def is_text_isbn(s):
    """Tell whether ``s`` consists solely of an ISBN / eISBN line.

    Surrounding whitespace is ignored, as in the other ``is_text_*``
    predicates that strip their input. Besides the plain
    ``ISBN 9781234567890`` form, the commonly printed variants are
    accepted: hyphen- or space-separated groups
    (``ISBN 978-3-16-148410-0``), an ``ISBN-10`` / ``ISBN-13`` label, an
    optional colon after the label, and a trailing ``X`` check digit.

    Args:
        s: Candidate paragraph text.

    Returns:
        bool: ``True`` when the whole stripped text is an ISBN line.
    """
    pattern = r"^[Ee]?ISBN(?:-1[03])?:?\s*\d[\d\s-]*[Xx]?$"
    return bool(re.match(pattern, s.strip()))
 def not_trans(s):
    """Decide whether the text ``s`` should be skipped by the translator.

    The text is skipped as soon as any one of the module's ``is_text_*``
    predicates accepts it: ``is_text_link``, ``is_text_tail_link``,
    ``is_text_source``, ``is_text_list``, ``is_text_figure``,
    ``is_text_digit_and_space`` or ``is_text_isbn``.

    Args:
        s: Candidate paragraph text.

    Returns:
        bool: ``True`` when at least one predicate matches.
    """
    checks = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    # A generator (not a list) lets any() stop at the first predicate that
    # matches instead of running every check on every paragraph.
    return any(check(s) for check in checks)