Feat: combine multiple lines into one block, add a new option --block_size (#370)

* Feat: combine multiple lines into one block

bug:  some text is not replaced with translation

* Fix: some text are not translated

known issue:
1. sometime the original text show up
2. resume function not working

* Style: clean up code
This commit is contained in:
Ninzore 2024-01-19 04:39:50 +00:00 committed by GitHub
parent 1d7685b86f
commit 40aaa9b090
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 84 additions and 10 deletions

View File

@ -269,6 +269,12 @@ So you are close to reaching the limit. You have to choose your own value, there
default=1.0,
help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
)
parser.add_argument(
"--block_size",
type=int,
default=-1,
help="merge multiple paragraphs into one block, may increase accuracy and speed up the process, but disturb the original format, must be used with `--single_translate`",
)
options = parser.parse_args()
@ -338,6 +344,11 @@ So you are close to reaching the limit. You have to choose your own value, there
f"now only support files of these formats: {','.join(support_type_list)}",
)
if options.block_size > 0 and not options.single_translate:
raise Exception(
"block_size must be used with `--single_translate` because it disturbs the original format",
)
book_loader = BOOK_LOADER_DICT.get(book_type)
assert book_loader is not None, "unsupported loader"
language = options.language
@ -394,6 +405,8 @@ So you are close to reaching the limit. You have to choose your own value, there
# TODO refactor, quick fix for gpt4 model
if options.model == "gpt4":
e.translate_model.set_gpt4_models("gpt4")
if options.block_size > 0:
e.block_size = options.block_size
e.make_bilingual_book()

View File

@ -62,6 +62,7 @@ class EPUBBookLoader(BaseBookLoader):
self.exclude_filelist = ""
self.only_filelist = ""
self.single_translate = single_translate
self.block_size = -1
# monkey patch for # 173
def _write_items_patch(obj):
@ -126,21 +127,18 @@ class EPUBBookLoader(BaseBookLoader):
new_book.toc = book.toc
return new_book
def _process_paragraph(self, p, index, p_to_save_len):
if not p.text or self._is_special_text(p.text):
return index
new_p = copy(p)
def _extract_paragraph(self, p):
for p_exclude in self.exclude_translate_tags.split(","):
# for issue #280
if type(p) == NavigableString:
continue
for pt in new_p.find_all(p_exclude):
for pt in p.find_all(p_exclude):
pt.extract()
return p
def _process_paragraph(self, p, new_p, index, p_to_save_len):
if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
p.string = self.p_to_save[index]
else:
if type(p) == NavigableString:
new_p = self.translate_model.translate(new_p.text)
@ -156,7 +154,46 @@ class EPUBBookLoader(BaseBookLoader):
if index % 20 == 0:
self._save_progress()
return index
def _process_combined_paragraph(self, p_block, index, p_to_save_len):
text = []
for p in p_block:
if self.resume and index < p_to_save_len:
p.string = self.p_to_save[index]
else:
p_text = p.text.rstrip()
text.append(p_text)
if self.is_test and index >= self.test_num:
break
index += 1
if len(text) > 0:
translated_text = self.translate_model.translate("\n".join(text))
translated_text = translated_text.split("\n")
text_len = len(translated_text)
for i in range(text_len):
t = translated_text[i]
if i >= len(p_block):
p = p_block[-1]
else:
p = p_block[i]
if type(p) == NavigableString:
p = t
else:
p.string = t
self.helper.insert_trans(
p, p.string, self.translation_style, self.single_translate
)
self._save_progress()
return index
def translate_paragraphs_acc(self, p_list, send_num):
@ -377,15 +414,39 @@ class EPUBBookLoader(BaseBookLoader):
self.translate_paragraphs_acc(p_list, send_num)
else:
is_test_done = self.is_test and index > self.test_num
p_block = []
block_len = 0
for p in p_list:
if is_test_done:
break
index = self._process_paragraph(p, index, p_to_save_len)
if not p.text or self._is_special_text(p.text):
pbar.update(1)
continue
new_p = self._extract_paragraph(copy(p))
if self.single_translate and self.block_size > 0:
p_len = num_tokens_from_text(new_p.text)
block_len += p_len
if block_len > self.block_size:
index = self._process_combined_paragraph(
p_block, index, p_to_save_len
)
p_block = [p]
block_len = p_len
print()
else:
p_block.append(p)
else:
index = self._process_paragraph(p, new_p, index, p_to_save_len)
print()
# pbar.update(delta) not pbar.update(index)?
pbar.update(1)
print()
if self.is_test and index >= self.test_num:
break
if self.single_translate and self.block_size > 0 and len(p_block) > 0:
index = self._process_combined_paragraph(p_block, index, p_to_save_len)
if soup:
item.content = soup.encode()