support --retranslate

use string for start,end

clean

autofind filename

fix bug, don't use index

clean
h 2023-03-20 01:11:53 +08:00
parent 6f56ac7a25
commit dfa1b9ada8
3 changed files with 196 additions and 28 deletions


@@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own
value, there is no way to know if the limit is reached before sending
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
- `--retranslate` `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str" (optional)`<br>
  Retranslate every tag from start_str's tag through end_str's tag:
  `python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
  Retranslate only start_str's tag:
  `python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'`
### Examples


@@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
    type=int,
    help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
)
parser.add_argument(
    "--retranslate",
    dest="retranslate",
    nargs=4,
    type=str,
    help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
Retranslate from start_str to end_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
Retranslate start_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
""",
)
options = parser.parse_args()
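
For reference, a quick sketch (not part of the diff) of what this argument definition yields at runtime: with `nargs=4`, argparse always expects exactly four values after `--retranslate` and hands them back as a list of strings, and the `if fixend == "": fixend = fixstart` branch in `retranslate_book` suggests passing an empty string as the fourth value when only the start tag should be retranslated.

```python
# Sketch only: reproduces the --retranslate argument in isolation.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--retranslate", dest="retranslate", nargs=4, type=str)

# nargs=4 means exactly four values; the empty fourth value stands in for end_str.
options = parser.parse_args(
    [
        "--retranslate",
        "test_books/animal_farm_bilingual.epub",
        "index_split_002.html",
        "in spite of the present book shortage which",
        "",
    ]
)
print(options.retranslate)
# ['test_books/animal_farm_bilingual.epub', 'index_split_002.html',
#  'in spite of the present book shortage which', '']
```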
@@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
    e.translation_style = options.translation_style
if options.batch_size:
    e.batch_size = options.batch_size
if options.retranslate:
    e.retranslate = options.retranslate
e.make_bilingual_book()


@@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
        self.retranslate = None

        # monkey patch for #173
        def _write_items_patch(obj):
@@ -168,42 +169,146 @@ class EPUBBookLoader(BaseBookLoader):
            wait_p_list.append(p)
            count = length

    def get_item(self, book, name):
        for item in book.get_items():
            if item.file_name == name:
                return item

    def find_items_containing_string(self, book, search_string):
        matching_items = []
        for item in book.get_items_of_type(ITEM_DOCUMENT):
            content = item.get_content().decode("utf-8")
            if search_string in content:
                matching_items.append(item)
        return matching_items

    def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
        complete_book_name = retranslate[0]
        fixname = retranslate[1]
        fixstart = retranslate[2]
        fixend = retranslate[3]

        if fixend == "":
            fixend = fixstart

        name_fix = complete_book_name

        complete_book = epub.read_epub(complete_book_name)

        if fixname == "":
            fixname = self.find_items_containing_string(complete_book, fixstart)[
                0
            ].file_name
            print(f"auto find fixname: {fixname}")

        new_book = self._make_new_book(complete_book)

        complete_item = self.get_item(complete_book, fixname)
        if complete_item is None:
            return

        ori_item = self.get_item(self.origin_book, fixname)
        if ori_item is None:
            return

        soup_complete = bs(complete_item.content, "html.parser")
        soup_ori = bs(ori_item.content, "html.parser")

        p_list_complete = soup_complete.findAll(trans_taglist)
        p_list_ori = soup_ori.findAll(trans_taglist)

        target = None
        tagl = []

        # extract from range
        find_end = False
        find_start = False
        for tag in p_list_complete:
            if find_end:
                tagl.append(tag)
                break

            if fixend in tag.text:
                find_end = True

            if fixstart in tag.text:
                find_start = True

            if find_start:
                if not target:
                    target = tag.previous_sibling
                tagl.append(tag)

        for t in tagl:
            t.extract()

        flag = False
        extract_p_list_ori = []
        for p in p_list_ori:
            if fixstart in p.text:
                flag = True
            if flag:
                extract_p_list_ori.append(p)
            if fixend in p.text:
                break

        for t in extract_p_list_ori:
            target.insert_after(t)
            target = t

        for item in complete_book.get_items():
            if item.file_name != fixname:
                new_book.add_item(item)

        complete_item.content = soup_complete.prettify().encode()

        # =================================================

        index = self.process_item(
            complete_item,
            index,
            p_to_save_len,
            pbar,
            new_book,
            trans_taglist,
            fixstart,
            fixend,
        )
        epub.write_epub(f"{name_fix}", new_book, {})

    def process_item(
        self,
        item,
        index,
        p_to_save_len,
        pbar,
        new_book,
        trans_taglist,
        fixstart=None,
        fixend=None,
    ):
        if not os.path.exists("log"):
            os.makedirs("log")

        soup = bs(item.content, "html.parser")
        p_list = soup.findAll(trans_taglist)

        if self.retranslate:
            new_p_list = []

            if fixstart is None or fixend is None:
                return

            start_append = False
            for p in p_list:
                text = p.get_text()
                if fixstart in text or fixend in text or start_append:
                    start_append = True
                    new_p_list.append(p)
                if fixend in text:
                    p_list = new_p_list
                    break

        if self.allow_navigable_strings:
            p_list.extend(soup.findAll(text=True))
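
To make the splice in `retranslate_book` above easier to follow, here is a self-contained, simplified sketch of the same BeautifulSoup pattern on invented HTML (the tag contents and marker strings are made up for illustration): the tags from the start marker through the end marker are removed with `extract()`, and replacement tags are chained back in with `insert_after()` starting from the sibling just before the removed range.

```python
# Standalone illustration of the extract()/insert_after() splice; the HTML
# and marker strings are invented for the example.
from bs4 import BeautifulSoup as bs

html = "<div><p>intro</p><p>START old</p><p>old middle</p><p>END old</p><p>outro</p></div>"
soup = bs(html, "html.parser")

fixstart, fixend = "START", "END"
p_list = soup.findAll("p")

# Collect tags from the one containing fixstart through the one containing fixend,
# remembering the sibling just before the range as the re-insertion anchor.
# Like the code above, this assumes a sibling exists before the start tag.
target, tagl = None, []
find_start = False
for tag in p_list:
    if fixstart in tag.text:
        find_start = True
    if find_start:
        if target is None:
            target = tag.previous_sibling
        tagl.append(tag)
    if fixend in tag.text:
        break

for t in tagl:
    t.extract()

# Splice replacement tags in after the anchor, advancing the anchor each time.
replacements = bs("<p>START new</p><p>END new</p>", "html.parser").findAll("p")
for t in replacements:
    target.insert_after(t)
    target = t

print(soup)
# <div><p>intro</p><p>START new</p><p>END new</p><p>outro</p></div>
```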
@@ -228,6 +333,49 @@ class EPUBBookLoader(BaseBookLoader):
        item.content = soup.prettify().encode()
        new_book.add_item(item)
        return index

    def make_bilingual_book(self):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
        new_book = self._make_new_book(self.origin_book)
        all_items = list(self.origin_book.get_items())
        trans_taglist = self.translate_tags.split(",")
        all_p_length = sum(
            0
            if i.get_type() != ITEM_DOCUMENT
            else len(bs(i.content, "html.parser").findAll(trans_taglist))
            for i in all_items
        )
        all_p_length += self.allow_navigable_strings * sum(
            0
            if i.get_type() != ITEM_DOCUMENT
            else len(bs(i.content, "html.parser").findAll(text=True))
            for i in all_items
        )
        pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
        index = 0
        p_to_save_len = len(self.p_to_save)
        try:
            if self.retranslate:
                self.retranslate_book(
                    index, p_to_save_len, pbar, trans_taglist, self.retranslate
                )
                exit(0)

            # Add the things that don't need to be translated first, so that you can see the img after the interruption
            for item in self.origin_book.get_items():
                if item.get_type() != ITEM_DOCUMENT:
                    new_book.add_item(item)

            for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
                # if item.file_name != "OEBPS/ch01.xhtml":
                #     continue
                index = self.process_item(
                    item, index, p_to_save_len, pbar, new_book, trans_taglist
                )

            if self.accumulated_num > 1:
                name, _ = os.path.splitext(self.epub_name)
                epub.write_epub(f"{name}_bilingual.epub", new_book, {})