support --retranslate

Use strings for start/end; clean up; auto-find the file name; fix bug (don't use index); clean up.
2025-06-05 19:15:34 +00:00 · 2023-03-20 01:11:53 +08:00 · 2023-03-20 01:11:53 +08:00 · dfa1b9ada8
commit dfa1b9ada8
parent 6f56ac7a25
3 changed files with 196 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
 output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own
 value, there is no way to know if the limit is reached before sending
 - `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
+- `--retranslate` usage: `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"`. All four values are required: pass `""` as `end_str` to retranslate only the tag containing `start_str`, and pass `""` as `file_name_in_epub` to auto-detect the file that contains `start_str`.<br>
+Retranslate every tag from the one containing `start_str` through the one containing `end_str`:
+`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
+Retranslate only the tag containing `start_str` (empty `end_str`):
+`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' ''`

 ### Examples

--- a/book_maker/cli.py
+++ b/book_maker/cli.py
@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
        type=int,
        help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
    )
+    parser.add_argument(
+        "--retranslate",
+        dest="retranslate",
+        nargs=4,
+        type=str,
+        help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
+        Retranslate from start_str to end_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
+        Retranslate start_str's tag:
+        python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
+""",
+    )

    options = parser.parse_args()

@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
        e.translation_style = options.translation_style
    if options.batch_size:
        e.batch_size = options.batch_size
+    if options.retranslate:
+        e.retranslate = options.retranslate
+
    e.make_bilingual_book()


--- a/book_maker/loader/epub_loader.py
+++ b/book_maker/loader/epub_loader.py
@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
+        self.retranslate = None

        # monkey pathch for # 173
        def _write_items_patch(obj):
@ -168,42 +169,146 @@ class EPUBBookLoader(BaseBookLoader):
                wait_p_list.append(p)
                count = length

-    def make_bilingual_book(self):
-        self.helper = EPUBBookLoaderHelper(
-            self.translate_model, self.accumulated_num, self.translation_style
-        )
-        new_book = self._make_new_book(self.origin_book)
-        all_items = list(self.origin_book.get_items())
-        trans_taglist = self.translate_tags.split(",")
-        all_p_length = sum(
+    def get_item(self, book, name):
+        for item in book.get_items():
+            if item.file_name == name:
+                return item
+
+    def find_items_containing_string(self, book, search_string):
+        matching_items = []
+
+        for item in book.get_items_of_type(ITEM_DOCUMENT):
+            content = item.get_content().decode("utf-8")
+            if search_string in content:
+                matching_items.append(item)
+
+        return matching_items
+
+    def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
+        complete_book_name = retranslate[0]
+        fixname = retranslate[1]
+        fixstart = retranslate[2]
+        fixend = retranslate[3]
+
+        if fixend == "":
+            fixend = fixstart
+
+        name_fix = complete_book_name
+
+        complete_book = epub.read_epub(complete_book_name)
+
+        if fixname == "":
+            fixname = self.find_items_containing_string(complete_book, fixstart)[
                0
-            if i.get_type() != ITEM_DOCUMENT
-            else len(bs(i.content, "html.parser").findAll(trans_taglist))
-            for i in all_items
-        )
-        all_p_length += self.allow_navigable_strings * sum(
-            0
-            if i.get_type() != ITEM_DOCUMENT
-            else len(bs(i.content, "html.parser").findAll(text=True))
-            for i in all_items
-        )
-        pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
-        index = 0
-        p_to_save_len = len(self.p_to_save)
-        try:
-            # Add the things that don't need to be translated first, so that you can see the img after the interruption
-            for item in self.origin_book.get_items():
-                if item.get_type() != ITEM_DOCUMENT:
+            ].file_name
+            print(f"auto find fixname: {fixname}")
+
+        new_book = self._make_new_book(complete_book)
+
+        complete_item = self.get_item(complete_book, fixname)
+        if complete_item is None:
+            return
+
+        ori_item = self.get_item(self.origin_book, fixname)
+        if ori_item is None:
+            return
+
+        soup_complete = bs(complete_item.content, "html.parser")
+        soup_ori = bs(ori_item.content, "html.parser")
+
+        p_list_complete = soup_complete.findAll(trans_taglist)
+        p_list_ori = soup_ori.findAll(trans_taglist)
+
+        target = None
+        tagl = []
+
+        # extract from range
+        find_end = False
+        find_start = False
+        for tag in p_list_complete:
+            if find_end:
+                tagl.append(tag)
+                break
+
+            if fixend in tag.text:
+                find_end = True
+            if fixstart in tag.text:
+                find_start = True
+
+            if find_start:
+                if not target:
+                    target = tag.previous_sibling
+                tagl.append(tag)
+
+        for t in tagl:
+            t.extract()
+
+        flag = False
+        extract_p_list_ori = []
+        for p in p_list_ori:
+            if fixstart in p.text:
+                flag = True
+            if flag:
+                extract_p_list_ori.append(p)
+            if fixend in p.text:
+                break
+
+        for t in extract_p_list_ori:
+            target.insert_after(t)
+            target = t
+
+        for item in complete_book.get_items():
+            if item.file_name != fixname:
                new_book.add_item(item)

-            for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
-                # if item.file_name != "OEBPS/ch01.xhtml":
-                #     continue
+        complete_item.content = soup_complete.prettify().encode()
+
+        # =================================================
+        index = self.process_item(
+            complete_item,
+            index,
+            p_to_save_len,
+            pbar,
+            new_book,
+            trans_taglist,
+            fixstart,
+            fixend,
+        )
+        epub.write_epub(f"{name_fix}", new_book, {})
+
+    def process_item(
+        self,
+        item,
+        index,
+        p_to_save_len,
+        pbar,
+        new_book,
+        trans_taglist,
+        fixstart=None,
+        fixend=None,
+    ):
        if not os.path.exists("log"):
            os.makedirs("log")

        soup = bs(item.content, "html.parser")
        p_list = soup.findAll(trans_taglist)
+
+        if self.retranslate:
+            new_p_list = []
+
+            if fixstart is None or fixend is None:
+                return
+
+            start_append = False
+            for p in p_list:
+                text = p.get_text()
+                if fixstart in text or fixend in text or start_append:
+                    start_append = True
+                    new_p_list.append(p)
+                if fixend in text:
+                    p_list = new_p_list
+                    break
+
        if self.allow_navigable_strings:
            p_list.extend(soup.findAll(text=True))

@ -228,6 +333,49 @@ class EPUBBookLoader(BaseBookLoader):

        item.content = soup.prettify().encode()
        new_book.add_item(item)
+
+        return index
+
+    def make_bilingual_book(self):
+        self.helper = EPUBBookLoaderHelper(
+            self.translate_model, self.accumulated_num, self.translation_style
+        )
+        new_book = self._make_new_book(self.origin_book)
+        all_items = list(self.origin_book.get_items())
+        trans_taglist = self.translate_tags.split(",")
+        all_p_length = sum(
+            0
+            if i.get_type() != ITEM_DOCUMENT
+            else len(bs(i.content, "html.parser").findAll(trans_taglist))
+            for i in all_items
+        )
+        all_p_length += self.allow_navigable_strings * sum(
+            0
+            if i.get_type() != ITEM_DOCUMENT
+            else len(bs(i.content, "html.parser").findAll(text=True))
+            for i in all_items
+        )
+        pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
+        index = 0
+        p_to_save_len = len(self.p_to_save)
+        try:
+            if self.retranslate:
+                self.retranslate_book(
+                    index, p_to_save_len, pbar, trans_taglist, self.retranslate
+                )
+                exit(0)
+            # Add the things that don't need to be translated first, so that you can see the img after the interruption
+            for item in self.origin_book.get_items():
+                if item.get_type() != ITEM_DOCUMENT:
+                    new_book.add_item(item)
+
+            for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
+                # if item.file_name != "OEBPS/ch01.xhtml":
+                #     continue
+                index = self.process_item(
+                    item, index, p_to_save_len, pbar, new_book, trans_taglist
+                )
+
                if self.accumulated_num > 1:
                    name, _ = os.path.splitext(self.epub_name)
                    epub.write_epub(f"{name}_bilingual.epub", new_book, {})