mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-05 19:15:34 +00:00
Feat: combine multiple lines into one block, add a new option --block_size (#370)
* Feat: combine multiple lines into one block (bug: some text is not replaced with its translation) * Fix: some text was not translated. Known issues: 1. sometimes the original text still shows up; 2. the resume function does not work * Style: clean up code
This commit is contained in:
parent
1d7685b86f
commit
40aaa9b090
@ -269,6 +269,12 @@ So you are close to reaching the limit. You have to choose your own value, there
|
||||
default=1.0,
|
||||
help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--block_size",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="merge multiple paragraphs into one block, may increase accuracy and speed up the process, but disturb the original format, must be used with `--single_translate`",
|
||||
)
|
||||
|
||||
options = parser.parse_args()
|
||||
|
||||
@ -338,6 +344,11 @@ So you are close to reaching the limit. You have to choose your own value, there
|
||||
f"now only support files of these formats: {','.join(support_type_list)}",
|
||||
)
|
||||
|
||||
if options.block_size > 0 and not options.single_translate:
|
||||
raise Exception(
|
||||
"block_size must be used with `--single_translate` because it disturbs the original format",
|
||||
)
|
||||
|
||||
book_loader = BOOK_LOADER_DICT.get(book_type)
|
||||
assert book_loader is not None, "unsupported loader"
|
||||
language = options.language
|
||||
@ -394,6 +405,8 @@ So you are close to reaching the limit. You have to choose your own value, there
|
||||
# TODO refactor, quick fix for gpt4 model
|
||||
if options.model == "gpt4":
|
||||
e.translate_model.set_gpt4_models("gpt4")
|
||||
if options.block_size > 0:
|
||||
e.block_size = options.block_size
|
||||
|
||||
e.make_bilingual_book()
|
||||
|
||||
|
@ -62,6 +62,7 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
self.exclude_filelist = ""
|
||||
self.only_filelist = ""
|
||||
self.single_translate = single_translate
|
||||
self.block_size = -1
|
||||
|
||||
# monkey patch for # 173
|
||||
def _write_items_patch(obj):
|
||||
@ -126,21 +127,18 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
new_book.toc = book.toc
|
||||
return new_book
|
||||
|
||||
def _process_paragraph(self, p, index, p_to_save_len):
|
||||
if not p.text or self._is_special_text(p.text):
|
||||
return index
|
||||
|
||||
new_p = copy(p)
|
||||
|
||||
def _extract_paragraph(self, p):
    """Detach excluded tags from a paragraph and return the paragraph.

    Every descendant of ``p`` whose tag name appears in the comma-separated
    ``self.exclude_translate_tags`` is removed with ``extract()``.  ``p`` is
    modified in place, so a caller that needs the original untouched must
    pass a copy.

    Args:
        p: the paragraph node to clean.

    Returns:
        The same ``p`` object, with the excluded tags removed.
    """
    # for issue #280: a plain text node is returned as-is, without any
    # tag search.  The check does not depend on the tag name, so it is done
    # once up front instead of once per excluded tag.
    if type(p) is NavigableString:
        return p

    for tag_name in self.exclude_translate_tags.split(","):
        # An empty entry (empty option value, doubled or trailing comma)
        # names no tag, so there is nothing to exclude for it.
        if not tag_name:
            continue
        for pt in p.find_all(tag_name):
            pt.extract()
    return p
|
||||
|
||||
def _process_paragraph(self, p, new_p, index, p_to_save_len):
|
||||
if self.resume and index < p_to_save_len:
|
||||
new_p.string = self.p_to_save[index]
|
||||
p.string = self.p_to_save[index]
|
||||
else:
|
||||
if type(p) == NavigableString:
|
||||
new_p = self.translate_model.translate(new_p.text)
|
||||
@ -156,7 +154,46 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
|
||||
if index % 20 == 0:
|
||||
self._save_progress()
|
||||
return index
|
||||
|
||||
def _process_combined_paragraph(self, p_block, index, p_to_save_len):
    """Translate a block of paragraphs with one translation request.

    Paragraphs whose counter is below ``p_to_save_len`` while resuming are
    restored from ``self.p_to_save``.  The remaining paragraphs have their
    right-stripped text joined with newlines and sent to
    ``self.translate_model.translate`` in a single call; the reply is split
    on newlines and written back one line per paragraph, in order.

    If the reply has more lines than paragraphs were sent, the surplus lines
    are folded (newline-joined) into the last paragraph so no translated
    text is dropped.  If it has fewer, the paragraphs without a matching
    line are left untouched.

    Args:
        p_block: the paragraphs that make up the block, in document order.
        index: running paragraph counter on entry.
        p_to_save_len: resume threshold; counters below it are restored
            from ``self.p_to_save`` instead of being translated.

    Returns:
        The paragraph counter after the block has been consumed.
    """
    pending = []  # paragraphs that still need a translation, in order
    text = []  # their source text, parallel to ``pending``

    for p in p_block:
        if self.resume and index < p_to_save_len:
            p.string = self.p_to_save[index]
        else:
            pending.append(p)
            text.append(p.text.rstrip())

        if self.is_test and index >= self.test_num:
            break

        index += 1

    if pending:
        translated = self.translate_model.translate("\n".join(text)).split("\n")

        # Surplus lines belong to the tail of the block: merge them into the
        # last paragraph rather than overwriting it once per extra line.
        if len(translated) > len(pending):
            last = len(pending) - 1
            translated = translated[:last] + ["\n".join(translated[last:])]

        # Pair each line with the paragraph it was produced from.  Indexing
        # into ``p_block`` would be off by the number of resumed paragraphs.
        for p, t in zip(pending, translated):
            # Exact type check kept on purpose: subclasses of
            # NavigableString keep taking the tag branch, as before.
            if type(p) is NavigableString:
                # A text node cannot take the translation through
                # ``.string`` assignment, so hand the text to the helper.
                # NOTE(review): confirm insert_trans accepts a text node.
                self.helper.insert_trans(
                    p, t, self.translation_style, self.single_translate
                )
            else:
                p.string = t
                self.helper.insert_trans(
                    p, p.string, self.translation_style, self.single_translate
                )

    self._save_progress()
    return index
|
||||
|
||||
def translate_paragraphs_acc(self, p_list, send_num):
|
||||
@ -377,15 +414,39 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
self.translate_paragraphs_acc(p_list, send_num)
|
||||
else:
|
||||
is_test_done = self.is_test and index > self.test_num
|
||||
p_block = []
|
||||
block_len = 0
|
||||
for p in p_list:
|
||||
if is_test_done:
|
||||
break
|
||||
index = self._process_paragraph(p, index, p_to_save_len)
|
||||
if not p.text or self._is_special_text(p.text):
|
||||
pbar.update(1)
|
||||
continue
|
||||
|
||||
new_p = self._extract_paragraph(copy(p))
|
||||
if self.single_translate and self.block_size > 0:
|
||||
p_len = num_tokens_from_text(new_p.text)
|
||||
block_len += p_len
|
||||
if block_len > self.block_size:
|
||||
index = self._process_combined_paragraph(
|
||||
p_block, index, p_to_save_len
|
||||
)
|
||||
p_block = [p]
|
||||
block_len = p_len
|
||||
print()
|
||||
else:
|
||||
p_block.append(p)
|
||||
else:
|
||||
index = self._process_paragraph(p, new_p, index, p_to_save_len)
|
||||
print()
|
||||
|
||||
# pbar.update(delta) not pbar.update(index)?
|
||||
pbar.update(1)
|
||||
print()
|
||||
|
||||
if self.is_test and index >= self.test_num:
|
||||
break
|
||||
if self.single_translate and self.block_size > 0 and len(p_block) > 0:
|
||||
index = self._process_combined_paragraph(p_block, index, p_to_save_len)
|
||||
|
||||
if soup:
|
||||
item.content = soup.encode()
|
||||
|
Loading…
x
Reference in New Issue
Block a user