support --retranslate

use string for start,end clean autofind filename fix bug, don't use index clean
2025-06-05 19:15:34 +00:00 · 2023-03-20 01:11:53 +08:00 · 2023-03-20 01:11:53 +08:00 · dfa1b9ada8
commit dfa1b9ada8
parent 6f56ac7a25
3 changed files with 196 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
 output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own
 value, there is no way to know if the limit is reached before sending
 - `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
+- `--retranslate` `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"` (all four values must be given; pass `""` as `end_str` to retranslate only the tag that contains `start_str`)<br>
+Retranslate from start_str's tag to end_str's tag:
+`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
+Retranslate only start_str's tag (empty `end_str`):
+`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' ""`

 ### Examples

--- a/book_maker/cli.py
+++ b/book_maker/cli.py
@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
        type=int,
        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
    )
+    parser.add_argument(
+        "--retranslate",
+        dest="retranslate",
+        nargs=4,
+        type=str,
+        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
+        Retranslate from start_str to end_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
+        Retranslate start_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
+""",
+    )

    options = parser.parse_args()

@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
        e.translation_style = options.translation_style
    if options.batch_size:
        e.batch_size = options.batch_size
+    if options.retranslate:
+        e.retranslate = options.retranslate
+
    e.make_bilingual_book()


--- a/book_maker/loader/epub_loader.py
+++ b/book_maker/loader/epub_loader.py
@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
+        self.retranslate = None

        # monkey pathch for # 173
        def _write_items_patch(obj):
@ -168,6 +169,173 @@ class EPUBBookLoader(BaseBookLoader):
                wait_p_list.append(p)
                count = length

+    def get_item(self, book, name):
+        """Return the first item of ``book`` whose ``file_name`` equals ``name``.
+
+        ``None`` is returned when no item carries that file name.
+        """
+        return next(
+            (entry for entry in book.get_items() if entry.file_name == name),
+            None,
+        )
+
+    def find_items_containing_string(self, book, search_string):
+        """Return the document items of ``book`` that contain ``search_string``.
+
+        Each item of type ``ITEM_DOCUMENT`` has its content decoded as UTF-8
+        and is kept, in iteration order, when ``search_string`` occurs in it.
+        """
+        return [
+            doc
+            for doc in book.get_items_of_type(ITEM_DOCUMENT)
+            if search_string in doc.get_content().decode("utf-8")
+        ]
+
+    def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
+        """Re-translate one passage of an already translated (bilingual) epub.
+
+        ``retranslate`` is the ``--retranslate`` value list
+        ``[translated_filepath, file_name_in_epub, start_str, end_str]``.
+        An empty (or missing) ``end_str`` limits the range to the tag that
+        contains ``start_str``. An empty ``file_name_in_epub`` selects the
+        first document of the translated book whose content contains
+        ``start_str``.
+
+        The tags of the range are removed from the translated document, the
+        matching tags of ``self.origin_book`` are inserted in their place, the
+        range is translated again through ``process_item`` and the book is
+        written back to ``translated_filepath``. Nothing is written when the
+        file or the range cannot be located.
+        """
+        complete_book_name, fixname, fixstart, *rest = retranslate
+        # A missing or empty end marker means "only the start tag".
+        fixend = rest[0] if rest and rest[0] != "" else fixstart
+
+        complete_book = epub.read_epub(complete_book_name)
+
+        if fixname == "":
+            candidates = self.find_items_containing_string(complete_book, fixstart)
+            if not candidates:
+                print(f"no file in {complete_book_name} contains: {fixstart}")
+                return
+            fixname = candidates[0].file_name
+            print(f"auto find fixname: {fixname}")
+
+        new_book = self._make_new_book(complete_book)
+
+        complete_item = self.get_item(complete_book, fixname)
+        if complete_item is None:
+            return
+
+        ori_item = self.get_item(self.origin_book, fixname)
+        if ori_item is None:
+            return
+
+        soup_complete = bs(complete_item.content, "html.parser")
+        soup_ori = bs(ori_item.content, "html.parser")
+
+        p_list_complete = soup_complete.findAll(trans_taglist)
+        p_list_ori = soup_ori.findAll(trans_taglist)
+
+        # Tags of the translated document that fall inside the range. The tag
+        # right after the one holding the end marker is taken as well.
+        stale_tags = []
+        find_end = False
+        find_start = False
+        for tag in p_list_complete:
+            if find_end:
+                stale_tags.append(tag)
+                break
+
+            if fixend in tag.text:
+                find_end = True
+            if fixstart in tag.text:
+                find_start = True
+
+            if find_start:
+                stale_tags.append(tag)
+
+        # Tags of the original document covering the same range.
+        fresh_tags = []
+        in_range = False
+        for p in p_list_ori:
+            if fixstart in p.text:
+                in_range = True
+            if in_range:
+                fresh_tags.append(p)
+            if fixend in p.text:
+                break
+
+        if not find_start or not fresh_tags:
+            # Bail out before touching the translated document; otherwise the
+            # range would be removed without anything to put in its place.
+            print(f"can not find the range to retranslate in {fixname}")
+            return
+
+        # Remember where the range started before its tags are detached.
+        first_tag = stale_tags[0]
+        anchor = first_tag.previous_sibling
+        anchor_parent = first_tag.parent
+
+        for tag in stale_tags:
+            tag.extract()
+
+        for p in fresh_tags:
+            if anchor is None:
+                # The range started at the very first child of its parent.
+                anchor_parent.insert(0, p)
+            else:
+                anchor.insert_after(p)
+            anchor = p
+
+        for item in complete_book.get_items():
+            if item.file_name != fixname:
+                new_book.add_item(item)
+
+        complete_item.content = soup_complete.prettify().encode()
+
+        # =================================================
+        self.process_item(
+            complete_item,
+            index,
+            p_to_save_len,
+            pbar,
+            new_book,
+            trans_taglist,
+            fixstart,
+            fixend,
+        )
+        epub.write_epub(complete_book_name, new_book, {})
+
+    def process_item(
+        self,
+        item,
+        index,
+        p_to_save_len,
+        pbar,
+        new_book,
+        trans_taglist,
+        fixstart=None,
+        fixend=None,
+    ):
+        """Translate the tags of one epub document and add it to ``new_book``.
+
+        All tags named in ``trans_taglist`` are collected from ``item``. When
+        ``self.retranslate`` is set, only the tags from the first one
+        containing ``fixstart`` (or ``fixend``) through the first one
+        containing ``fixend`` are kept. If ``fixend`` is never matched, the
+        range runs to the end of the document.
+
+        With ``self.accumulated_num > 1`` the tags are handed to
+        ``translate_paragraphs_acc``. Otherwise they are processed one by one
+        with ``_process_paragraph``, stopping early in test mode.
+
+        Returns the updated paragraph counter ``index``.
+        """
+        if not os.path.exists("log"):
+            os.makedirs("log")
+
+        soup = bs(item.content, "html.parser")
+        p_list = soup.findAll(trans_taglist)
+
+        if self.retranslate:
+            if fixstart is None or fixend is None:
+                # No range to narrow to: return the counter unchanged, because
+                # callers assign the result back to their own ``index``.
+                return index
+
+            new_p_list = []
+            start_append = False
+            for p in p_list:
+                text = p.get_text()
+                if fixstart in text or fixend in text or start_append:
+                    start_append = True
+                    new_p_list.append(p)
+                if fixend in text:
+                    break
+            # Narrow even when the end marker was never seen, so tags outside
+            # the requested range are not translated a second time.
+            p_list = new_p_list
+
+        if self.allow_navigable_strings:
+            p_list.extend(soup.findAll(text=True))
+
+        send_num = self.accumulated_num
+        if send_num > 1:
+            with open("log/buglog.txt", "a") as f:
+                print(f"------------- {item.file_name} -------------", file=f)
+
+            print("------------------------------------------------------")
+            print(f"dealing {item.file_name} ...")
+            self.translate_paragraphs_acc(p_list, send_num)
+        else:
+            is_test_done = self.is_test and index > self.test_num
+            for p in p_list:
+                if is_test_done:
+                    break
+                index = self._process_paragraph(p, index, p_to_save_len)
+                # pbar.update(delta) not pbar.update(index)?
+                pbar.update(1)
+                if self.is_test and index >= self.test_num:
+                    break
+
+        item.content = soup.prettify().encode()
+        new_book.add_item(item)
+
+        return index
+
    def make_bilingual_book(self):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
@ -191,6 +359,11 @@ class EPUBBookLoader(BaseBookLoader):
        index = 0
        p_to_save_len = len(self.p_to_save)
        try:
+            if self.retranslate:
+                self.retranslate_book(
+                    index, p_to_save_len, pbar, trans_taglist, self.retranslate
+                )
+                exit(0)
            # Add the things that don't need to be translated first, so that you can see the img after the interruption
            for item in self.origin_book.get_items():
                if item.get_type() != ITEM_DOCUMENT:
@ -199,35 +372,10 @@ class EPUBBookLoader(BaseBookLoader):
            for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
                # if item.file_name != "OEBPS/ch01.xhtml":
                #     continue
-                if not os.path.exists("log"):
-                    os.makedirs("log")
+                index = self.process_item(
+                    item, index, p_to_save_len, pbar, new_book, trans_taglist
+                )

-                soup = bs(item.content, "html.parser")
-                p_list = soup.findAll(trans_taglist)
-                if self.allow_navigable_strings:
-                    p_list.extend(soup.findAll(text=True))
-
-                send_num = self.accumulated_num
-                if send_num > 1:
-                    with open("log/buglog.txt", "a") as f:
-                        print(f"------------- {item.file_name} -------------", file=f)
-
-                    print("------------------------------------------------------")
-                    print(f"dealing {item.file_name} ...")
-                    self.translate_paragraphs_acc(p_list, send_num)
-                else:
-                    is_test_done = self.is_test and index > self.test_num
-                    for p in p_list:
-                        if is_test_done:
-                            break
-                        index = self._process_paragraph(p, index, p_to_save_len)
-                        # pbar.update(delta) not pbar.update(index)?
-                        pbar.update(1)
-                        if self.is_test and index >= self.test_num:
-                            break
-
-                item.content = soup.prettify().encode()
-                new_book.add_item(item)
                if self.accumulated_num > 1:
                    name, _ = os.path.splitext(self.epub_name)
                    epub.write_epub(f"{name}_bilingual.epub", new_book, {})