mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-07-19 08:30:16 +00:00
Add more cases that don't require translation
This commit is contained in:
parent
99bcaddff6
commit
79b2d67cc9
@ -14,14 +14,7 @@ from tqdm import tqdm
|
||||
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
|
||||
|
||||
from .base_loader import BaseBookLoader
|
||||
from .helper import (
|
||||
EPUBBookLoaderHelper,
|
||||
is_text_figure,
|
||||
is_text_link,
|
||||
is_text_list,
|
||||
is_text_source,
|
||||
is_text_tail_link,
|
||||
)
|
||||
from .helper import EPUBBookLoaderHelper, not_trans, is_text_link
|
||||
|
||||
|
||||
class EPUBBookLoader(BaseBookLoader):
|
||||
@ -144,14 +137,7 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
for sup in temp_p.find_all("sup"):
|
||||
sup.extract()
|
||||
if any(
|
||||
[
|
||||
not p.text,
|
||||
self._is_special_text(temp_p.text),
|
||||
is_text_source(temp_p.text),
|
||||
is_text_list(temp_p.text),
|
||||
is_text_figure(temp_p.text),
|
||||
is_text_tail_link(temp_p.text),
|
||||
]
|
||||
[not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
|
||||
):
|
||||
if i == len(p_list) - 1:
|
||||
self.helper.deal_old(wait_p_list)
|
||||
|
@ -31,7 +31,7 @@ class EPUBBookLoaderHelper:
|
||||
|
||||
def is_text_link(text):
    """Return True when ``text`` begins with a URL.

    Surrounding whitespace is ignored.  The text counts as a link when it
    starts with ``http://``, ``https://`` or ``www.`` followed by at least
    one URL character.  Only the start of the text is tested, so trailing
    non-URL content does not prevent a match.
    """
    # NOTE: inside the character class, ``$-_`` is a *range* (from "$" to
    # "_"), so the class accepts more punctuation than the characters listed
    # literally.  The pattern is kept as-is to preserve matching behaviour.
    url_pattern = re.compile(
        r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.match(text.strip()))
|
||||
|
||||
@ -56,3 +56,29 @@ def is_text_list(text, num=80):
|
||||
def is_text_figure(text, num=80):
    """Return True when ``text`` looks like a short figure caption.

    After stripping surrounding whitespace, the text must start with
    ``Figure`` followed by optional whitespace and at least one digit, and
    its length must be below ``num`` characters.

    Args:
        text: Paragraph text to inspect.
        num: Exclusive upper bound on the stripped text length.

    Returns:
        A strict ``bool`` (never ``None`` or a match object), consistent
        with the other ``is_text_*`` predicates that wrap results in
        ``bool``.
    """
    text = text.strip()
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num
|
||||
|
||||
|
||||
def is_text_digit_and_space(s):
    """Return True when every character of ``s`` is a digit or whitespace.

    An empty ``s`` has no offending character and therefore yields True.
    """
    return all(ch.isdigit() or ch.isspace() for ch in s)
|
||||
|
||||
|
||||
def is_text_isbn(s):
    """Return True when ``s`` consists solely of an ISBN / eISBN line.

    Surrounding whitespace is ignored, matching how ``is_text_figure`` and
    ``is_text_link`` strip their input.  Accepted forms:

    * an ``ISBN``, ``eISBN`` or ``EISBN`` label, optionally followed by
      ``-10`` / ``-13`` and/or a colon;
    * a number starting with a digit, whose groups may be separated by
      spaces or hyphens (the usual printed form);
    * an optional trailing ``X`` / ``x`` check digit.

    Everything the previous digits-and-spaces-only pattern accepted is
    still accepted.
    """
    pattern = r"^[Ee]?ISBN(?:-1[03])?:?\s*\d[\d\s-]*[Xx]?$"
    return bool(re.match(pattern, s.strip()))
|
||||
|
||||
|
||||
def not_trans(s):
    """Return True when ``s`` matches any "do not translate" heuristic.

    The text is run through every ``is_text_*`` predicate below (links,
    tail links, sources, lists, figure captions, digit/whitespace-only
    text, ISBN lines); a single truthy answer is enough.
    """
    predicates = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    # Evaluate every predicate up front (no short-circuiting), in order.
    outcomes = [predicate(s) for predicate in predicates]
    return any(outcomes)
|
||||
|
Loading…
x
Reference in New Issue
Block a user