mirror of https://github.com/yihong0618/bilingual_book_maker.git (synced 2025-06-06 11:35:49 +00:00)
Feat: combine multiple lines into one block, add a new option --block_size (#370)
* Feat: combine multiple lines into one block
  bug: some text is not replaced with translation
* Fix: some text is not translated
  known issues:
  1. sometimes the original text shows up
  2. resume function not working
* Style: clean up code
This commit is contained in:
parent 1d7685b86f
commit 40aaa9b090
@@ -269,6 +269,12 @@ So you are close to reaching the limit. You have to choose your own value, there
         default=1.0,
         help="temperature parameter for `chatgptapi`/`gpt4`/`claude`",
     )
+    parser.add_argument(
+        "--block_size",
+        type=int,
+        default=-1,
+        help="merge multiple paragraphs into one block, may increase accuracy and speed up the process, but disturb the original format, must be used with `--single_translate`",
+    )
 
     options = parser.parse_args()
 
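A minimal standalone sketch of how the new option behaves once parsed — not project code; it assumes `--single_translate` is an ordinary store_true flag, that values of `--block_size` at or below 0 leave block merging disabled, and it repeats the pairing rule this commit enforces a few hunks further down:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--single_translate", action="store_true")
parser.add_argument("--block_size", type=int, default=-1)

options = parser.parse_args(["--single_translate", "--block_size", "500"])

# same pairing rule the CLI checks in this commit: merging paragraphs into
# blocks only makes sense when the original text is replaced, not kept
if options.block_size > 0 and not options.single_translate:
    raise Exception("block_size must be used with `--single_translate`")

print(options.block_size)  # 500

On the command line this corresponds to passing `--single_translate --block_size 500` alongside the usual book and key options.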
@@ -338,6 +344,11 @@ So you are close to reaching the limit. You have to choose your own value, there
             f"now only support files of these formats: {','.join(support_type_list)}",
         )
 
+    if options.block_size > 0 and not options.single_translate:
+        raise Exception(
+            "block_size must be used with `--single_translate` because it disturbs the original format",
+        )
+
     book_loader = BOOK_LOADER_DICT.get(book_type)
     assert book_loader is not None, "unsupported loader"
     language = options.language
@@ -394,6 +405,8 @@ So you are close to reaching the limit. You have to choose your own value, there
     # TODO refactor, quick fix for gpt4 model
     if options.model == "gpt4":
         e.translate_model.set_gpt4_models("gpt4")
+    if options.block_size > 0:
+        e.block_size = options.block_size
 
     e.make_bilingual_book()
 
@@ -62,6 +62,7 @@ class EPUBBookLoader(BaseBookLoader):
         self.exclude_filelist = ""
         self.only_filelist = ""
         self.single_translate = single_translate
+        self.block_size = -1
 
     # monkey patch for # 173
     def _write_items_patch(obj):
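A tiny sketch of the default-plus-override wiring that the two CLI hunks above and this loader attribute set up together (class and attribute names follow the diff; everything else is illustrative):

class Loader:
    def __init__(self):
        self.block_size = -1           # merging disabled unless the CLI overrides it

e = Loader()
parsed_block_size = 500                # what --block_size 500 would have parsed to
if parsed_block_size > 0:
    e.block_size = parsed_block_size

print(e.block_size)  # 500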
@@ -126,21 +127,18 @@ class EPUBBookLoader(BaseBookLoader):
         new_book.toc = book.toc
         return new_book
 
-    def _process_paragraph(self, p, index, p_to_save_len):
-        if not p.text or self._is_special_text(p.text):
-            return index
-
-        new_p = copy(p)
-
+    def _extract_paragraph(self, p):
         for p_exclude in self.exclude_translate_tags.split(","):
             # for issue #280
             if type(p) == NavigableString:
                 continue
-            for pt in new_p.find_all(p_exclude):
+            for pt in p.find_all(p_exclude):
                 pt.extract()
+        return p
 
+    def _process_paragraph(self, p, new_p, index, p_to_save_len):
         if self.resume and index < p_to_save_len:
-            new_p.string = self.p_to_save[index]
+            p.string = self.p_to_save[index]
         else:
             if type(p) == NavigableString:
                 new_p = self.translate_model.translate(new_p.text)
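The refactor pulls the tag stripping out into `_extract_paragraph`, which operates on a copy of the paragraph and detaches any excluded tags before translation. A rough standalone illustration of that idea using BeautifulSoup directly (the markup and the `sup` tag here are invented for the example):

from copy import copy

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    "<p>Keep this sentence<sup>1</sup> but drop the footnote marker.</p>",
    "html.parser",
)
p = soup.find("p")

new_p = copy(p)                    # work on a copy so the original markup survives
for pt in new_p.find_all("sup"):   # e.g. exclude_translate_tags = "sup"
    pt.extract()                   # detach the tag from the copied paragraph

print(new_p.text)                  # Keep this sentence but drop the footnote marker.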
@@ -156,7 +154,46 @@ class EPUBBookLoader(BaseBookLoader):
 
         if index % 20 == 0:
             self._save_progress()
+        return index
+
+    def _process_combined_paragraph(self, p_block, index, p_to_save_len):
+        text = []
+
+        for p in p_block:
+            if self.resume and index < p_to_save_len:
+                p.string = self.p_to_save[index]
+            else:
+                p_text = p.text.rstrip()
+                text.append(p_text)
+
+            if self.is_test and index >= self.test_num:
+                break
+
+            index += 1
+
+        if len(text) > 0:
+            translated_text = self.translate_model.translate("\n".join(text))
+            translated_text = translated_text.split("\n")
+            text_len = len(translated_text)
+
+            for i in range(text_len):
+                t = translated_text[i]
+
+                if i >= len(p_block):
+                    p = p_block[-1]
+                else:
+                    p = p_block[i]
+
+                if type(p) == NavigableString:
+                    p = t
+                else:
+                    p.string = t
+
+                self.helper.insert_trans(
+                    p, p.string, self.translation_style, self.single_translate
+                )
+
+            self._save_progress()
         return index
 
     def translate_paragraphs_acc(self, p_list, send_num):
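`_process_combined_paragraph` sends a whole block of paragraphs as one newline-joined request and then maps the returned lines back onto the original paragraphs, with any surplus lines all landing on the last paragraph. A toy sketch of that mapping, where a fake `translate` just brackets each line in place of the real model call:

def translate(text):
    # stand-in for self.translate_model.translate(...)
    return "\n".join(f"[{line}]" for line in text.split("\n"))

paragraphs = ["First paragraph.", "Second paragraph.", "Third paragraph."]

translated_lines = translate("\n".join(paragraphs)).split("\n")
for i, line in enumerate(translated_lines):
    # lines beyond len(paragraphs) would all be written to the last paragraph
    target = paragraphs[min(i, len(paragraphs) - 1)]
    print(f"{target!r} -> {line!r}")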
@@ -377,15 +414,39 @@ class EPUBBookLoader(BaseBookLoader):
             self.translate_paragraphs_acc(p_list, send_num)
         else:
             is_test_done = self.is_test and index > self.test_num
+            p_block = []
+            block_len = 0
             for p in p_list:
                 if is_test_done:
                     break
-                index = self._process_paragraph(p, index, p_to_save_len)
+                if not p.text or self._is_special_text(p.text):
+                    pbar.update(1)
+                    continue
+
+                new_p = self._extract_paragraph(copy(p))
+                if self.single_translate and self.block_size > 0:
+                    p_len = num_tokens_from_text(new_p.text)
+                    block_len += p_len
+                    if block_len > self.block_size:
+                        index = self._process_combined_paragraph(
+                            p_block, index, p_to_save_len
+                        )
+                        p_block = [p]
+                        block_len = p_len
+                        print()
+                    else:
+                        p_block.append(p)
+                else:
+                    index = self._process_paragraph(p, new_p, index, p_to_save_len)
+                    print()
 
                 # pbar.update(delta) not pbar.update(index)?
                 pbar.update(1)
-                print()
                 if self.is_test and index >= self.test_num:
                     break
+            if self.single_translate and self.block_size > 0 and len(p_block) > 0:
+                index = self._process_combined_paragraph(p_block, index, p_to_save_len)
+
         if soup:
             item.content = soup.encode()
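Taken together, the loop accumulates paragraphs in `p_block` until their combined token estimate passes `block_size`; the block is then flushed in one request, the overflowing paragraph starts the next block, and whatever remains after the loop is flushed at the end. A simplified, self-contained sketch of that batching logic, where a word count stands in for `num_tokens_from_text` and `flush` stands in for `_process_combined_paragraph`:

def flush(block):
    if block:
        print("translate as one block:", block)

block_size = 6
p_block, block_len = [], 0

for p in ["one two three", "four five", "six seven eight", "nine"]:
    p_len = len(p.split())               # stand-in token count
    block_len += p_len
    if block_len > block_size:
        flush(p_block)                   # send what has accumulated so far
        p_block, block_len = [p], p_len  # the overflowing paragraph starts a new block
    else:
        p_block.append(p)

flush(p_block)                           # the final partial block still gets translated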