mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-07-19 08:30:16 +00:00
Add more cases that don't require translation
This commit is contained in:
parent
99bcaddff6
commit
79b2d67cc9
@ -14,14 +14,7 @@ from tqdm import tqdm
|
|||||||
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
|
from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
|
||||||
|
|
||||||
from .base_loader import BaseBookLoader
|
from .base_loader import BaseBookLoader
|
||||||
from .helper import (
|
from .helper import EPUBBookLoaderHelper, not_trans, is_text_link
|
||||||
EPUBBookLoaderHelper,
|
|
||||||
is_text_figure,
|
|
||||||
is_text_link,
|
|
||||||
is_text_list,
|
|
||||||
is_text_source,
|
|
||||||
is_text_tail_link,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class EPUBBookLoader(BaseBookLoader):
|
class EPUBBookLoader(BaseBookLoader):
|
||||||
@ -144,14 +137,7 @@ class EPUBBookLoader(BaseBookLoader):
|
|||||||
for sup in temp_p.find_all("sup"):
|
for sup in temp_p.find_all("sup"):
|
||||||
sup.extract()
|
sup.extract()
|
||||||
if any(
|
if any(
|
||||||
[
|
[not p.text, self._is_special_text(temp_p.text), not_trans(temp_p.text)]
|
||||||
not p.text,
|
|
||||||
self._is_special_text(temp_p.text),
|
|
||||||
is_text_source(temp_p.text),
|
|
||||||
is_text_list(temp_p.text),
|
|
||||||
is_text_figure(temp_p.text),
|
|
||||||
is_text_tail_link(temp_p.text),
|
|
||||||
]
|
|
||||||
):
|
):
|
||||||
if i == len(p_list) - 1:
|
if i == len(p_list) - 1:
|
||||||
self.helper.deal_old(wait_p_list)
|
self.helper.deal_old(wait_p_list)
|
||||||
|
@ -31,7 +31,7 @@ class EPUBBookLoaderHelper:
|
|||||||
|
|
||||||
def is_text_link(text):
    """Return True when *text* starts with something that looks like a URL.

    Surrounding whitespace is ignored.  A URL is recognised as an
    ``http://``, ``https://`` or ``www.`` prefix followed by at least one
    URL character.  Only the beginning of the text is tested, so anything
    that follows the URL does not change the result.
    """
    # NOTE(review): ``[$-_@.&+]`` contains the character range ``$``..``_``,
    # which covers digits, upper-case letters and most ASCII punctuation --
    # confirm that this breadth is intended.
    url_pattern = re.compile(
        r"(http[s]?://|www\.)+(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.match(text.strip()))
|
||||||
|
|
||||||
@ -56,3 +56,29 @@ def is_text_list(text, num=80):
|
|||||||
def is_text_figure(text, num=80):
    """Return True when *text* is a short figure caption.

    After stripping surrounding whitespace the text must begin with
    ``Figure`` followed by optional whitespace and at least one digit, and
    be shorter than *num* characters.

    Always returns a real ``bool`` (never ``None`` or a match object) so the
    result is safe to compare, store or serialise.
    """
    text = text.strip()
    return len(text) < num and re.match(r"^Figure\s*\d+", text) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def is_text_digit_and_space(s):
    """Return True when every character of *s* is a digit or whitespace.

    An empty string contains no offending character and therefore yields
    True as well.
    """
    return all(c.isdigit() or c.isspace() for c in s)
|
||||||
|
|
||||||
|
|
||||||
|
def is_text_isbn(s):
    """Return True when *s* consists solely of an ISBN / eISBN line.

    Accepted shape, after stripping surrounding whitespace:

    * an ``ISBN``, ``eISBN`` or ``EISBN`` label,
    * an optional ``-10`` / ``-13`` suffix and an optional colon,
    * a number made of digits that may be grouped with spaces or hyphens,
    * an optional trailing ``X`` check digit (used by ISBN-10).

    Everything matched by the earlier digits-and-spaces-only pattern is
    still matched.
    """
    pattern = r"[Ee]?ISBN(?:-1[03])?\s*:?\s*\d[\d\s-]*[Xx]?"
    # fullmatch on the stripped text: leading/trailing whitespace must not
    # prevent recognition, and nothing may follow the number.
    return re.fullmatch(pattern, s.strip()) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def not_trans(s):
    """Return True when *s* matches any of the "do not translate" checks.

    The checks are applied in order (link, tail link, source, list, figure,
    digits-and-spaces, ISBN) and evaluation stops at the first one that
    reports a match, so later checks are not run needlessly.
    """
    checks = (
        is_text_link,
        is_text_tail_link,
        is_text_source,
        is_text_list,
        is_text_figure,
        is_text_digit_and_space,
        is_text_isbn,
    )
    # A generator lets any() short-circuit on the first truthy result.
    return any(check(s) for check in checks)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user