mirror of https://github.com/yihong0618/bilingual_book_maker.git (synced 2025-06-06 11:35:49 +00:00)
Feat: combine multiple lines into one block, add a new option --block_size (#370)
* Feat: combine multiple lines into one block
  bug: some text is not replaced with translation
* Fix: some text is not translated
  known issues:
  1. sometimes the original text shows up
  2. resume function not working
* Style: clean up code
This commit is contained in:
parent 1d7685b86f
commit 40aaa9b090
@@ -269,6 +269,12 @@ So you are close to reaching the limit. You have to choose your own value, there
         default=1.0,
         help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
     )
+    parser.add_argument(
+        "--block_size",
+        type=int,
+        default=-1,
+        help="merge multiple paragraphs into one block, may increase accuracy and speed up the process, but disturb the original format, must be used with `--single_translate`",
+    )
 
     options = parser.parse_args()
 
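A minimal standalone sketch of how the new option behaves once parsed — not project code; it assumes `--single_translate` is an ordinary store_true flag, that values of `--block_size` at or below 0 leave block merging disabled, and it repeats the pairing rule this commit enforces a few hunks further down:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--single_translate", action="store_true")
parser.add_argument("--block_size", type=int, default=-1)

options = parser.parse_args(["--single_translate", "--block_size", "500"])

# same pairing rule the CLI checks in this commit: merging paragraphs into
# blocks only makes sense when the original text is replaced, not kept
if options.block_size > 0 and not options.single_translate:
    raise Exception("block_size must be used with `--single_translate`")

print(options.block_size)  # 500

On the command line this corresponds to passing `--single_translate --block_size 500` alongside the usual book and key options.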
@@ -338,6 +344,11 @@ So you are close to reaching the limit. You have to choose your own value, there
             f"now only support files of these formats: {','.join(support_type_list)}",
         )
 
+    if options.block_size > 0 and not options.single_translate:
+        raise Exception(
+            "block_size must be used with `--single_translate` because it disturbs the original format",
+        )
+
     book_loader = BOOK_LOADER_DICT.get(book_type)
     assert book_loader is not None, "unsupported loader"
     language = options.language
@@ -394,6 +405,8 @@ So you are close to reaching the limit. You have to choose your own value, there
     # TODO refactor, quick fix for gpt4 model
     if options.model == "gpt4":
         e.translate_model.set_gpt4_models("gpt4")
+    if options.block_size > 0:
+        e.block_size = options.block_size
 
     e.make_bilingual_book()
 
@@ -62,6 +62,7 @@ class EPUBBookLoader(BaseBookLoader):
         self.exclude_filelist = ""
         self.only_filelist = ""
         self.single_translate = single_translate
+        self.block_size = -1
 
     # monkey patch for # 173
     def _write_items_patch(obj):
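A tiny sketch of the default-plus-override wiring that the two CLI hunks above and this loader attribute set up together (class and attribute names follow the diff; everything else is illustrative):

class Loader:
    def __init__(self):
        self.block_size = -1           # merging disabled unless the CLI overrides it

e = Loader()
parsed_block_size = 500                # what --block_size 500 would have parsed to
if parsed_block_size > 0:
    e.block_size = parsed_block_size

print(e.block_size)  # 500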
@@ -126,21 +127,18 @@ class EPUBBookLoader(BaseBookLoader):
         new_book.toc = book.toc
         return new_book
 
-    def _process_paragraph(self, p, index, p_to_save_len):
-        if not p.text or self._is_special_text(p.text):
-            return index
-
-        new_p = copy(p)
-
+    def _extract_paragraph(self, p):
         for p_exclude in self.exclude_translate_tags.split(","):
             # for issue #280
             if type(p) == NavigableString:
                 continue
-            for pt in new_p.find_all(p_exclude):
+            for pt in p.find_all(p_exclude):
                 pt.extract()
+        return p
 
+    def _process_paragraph(self, p, new_p, index, p_to_save_len):
         if self.resume and index < p_to_save_len:
-            new_p.string = self.p_to_save[index]
+            p.string = self.p_to_save[index]
         else:
             if type(p) == NavigableString:
                 new_p = self.translate_model.translate(new_p.text)
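The refactor pulls the tag stripping out into `_extract_paragraph`, which operates on a copy of the paragraph and detaches any excluded tags before translation. A rough standalone illustration of that idea using BeautifulSoup directly (the markup and the `sup` tag here are invented for the example):

from copy import copy

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<p>Keep this sentence<sup>1</sup> but drop the footnote marker.</p>",
    "html.parser",
)
p = soup.find("p")

new_p = copy(p)                    # work on a copy so the original markup survives
for pt in new_p.find_all("sup"):   # e.g. exclude_translate_tags = "sup"
    pt.extract()                   # detach the tag from the copied paragraph

print(new_p.text)                  # Keep this sentence but drop the footnote marker.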
@@ -156,7 +154,46 @@ class EPUBBookLoader(BaseBookLoader):
 
         if index % 20 == 0:
             self._save_progress()
+        return index
+
+    def _process_combined_paragraph(self, p_block, index, p_to_save_len):
+        text = []
+
+        for p in p_block:
+            if self.resume and index < p_to_save_len:
+                p.string = self.p_to_save[index]
+            else:
+                p_text = p.text.rstrip()
+                text.append(p_text)
+
+            if self.is_test and index >= self.test_num:
+                break
+
+            index += 1
+
+        if len(text) > 0:
+            translated_text = self.translate_model.translate("\n".join(text))
+            translated_text = translated_text.split("\n")
+            text_len = len(translated_text)
+
+            for i in range(text_len):
+                t = translated_text[i]
+
+                if i >= len(p_block):
+                    p = p_block[-1]
+                else:
+                    p = p_block[i]
+
+                if type(p) == NavigableString:
+                    p = t
+                else:
+                    p.string = t
+
+                self.helper.insert_trans(
+                    p, p.string, self.translation_style, self.single_translate
+                )
+
+            self._save_progress()
         return index
 
     def translate_paragraphs_acc(self, p_list, send_num):
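`_process_combined_paragraph` sends a whole block of paragraphs as one newline-joined request and then maps the returned lines back onto the original paragraphs, with any surplus lines all landing on the last paragraph. A toy sketch of that mapping, where a fake `translate` just brackets each line in place of the real model call:

def translate(text):
    # stand-in for self.translate_model.translate(...)
    return "\n".join(f"[{line}]" for line in text.split("\n"))

paragraphs = ["First paragraph.", "Second paragraph.", "Third paragraph."]

translated_lines = translate("\n".join(paragraphs)).split("\n")
for i, line in enumerate(translated_lines):
    # lines beyond len(paragraphs) would all be written to the last paragraph
    target = paragraphs[min(i, len(paragraphs) - 1)]
    print(f"{target!r} -> {line!r}")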
@@ -377,15 +414,39 @@ class EPUBBookLoader(BaseBookLoader):
             self.translate_paragraphs_acc(p_list, send_num)
         else:
             is_test_done = self.is_test and index > self.test_num
+            p_block = []
+            block_len = 0
             for p in p_list:
                 if is_test_done:
                     break
-                index = self._process_paragraph(p, index, p_to_save_len)
+                if not p.text or self._is_special_text(p.text):
+                    pbar.update(1)
+                    continue
+
+                new_p = self._extract_paragraph(copy(p))
+                if self.single_translate and self.block_size > 0:
+                    p_len = num_tokens_from_text(new_p.text)
+                    block_len += p_len
+                    if block_len > self.block_size:
+                        index = self._process_combined_paragraph(
+                            p_block, index, p_to_save_len
+                        )
+                        p_block = [p]
+                        block_len = p_len
+                        print()
+                    else:
+                        p_block.append(p)
+                else:
+                    index = self._process_paragraph(p, new_p, index, p_to_save_len)
+                    print()
 
                 # pbar.update(delta) not pbar.update(index)?
                 pbar.update(1)
-                print()
                 if self.is_test and index >= self.test_num:
                     break
+            if self.single_translate and self.block_size > 0 and len(p_block) > 0:
+                index = self._process_combined_paragraph(p_block, index, p_to_save_len)
+
         if soup:
             item.content = soup.encode()
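Taken together, the loop accumulates paragraphs in `p_block` until their combined token estimate passes `block_size`; the block is then flushed in one request, the overflowing paragraph starts the next block, and whatever remains after the loop is flushed at the end. A simplified, self-contained sketch of that batching logic, where a word count stands in for `num_tokens_from_text` and `flush` stands in for `_process_combined_paragraph`:

def flush(block):
    if block:
        print("translate as one block:", block)

block_size = 6
p_block, block_len = [], 0

for p in ["one two three", "four five", "six seven eight", "nine"]:
    p_len = len(p.split())               # stand-in token count
    block_len += p_len
    if block_len > block_size:
        flush(p_block)                   # send what has accumulated so far
        p_block, block_len = [p], p_len  # the overflowing paragraph starts a new block
    else:
        p_block.append(p)

flush(p_block)                           # the final partial block still gets translated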