diff --git a/README.md b/README.md index ad5768c..fd43884 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending - `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"` +- `--retranslate` `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)`
+Retranslate from start_str to end_str's tag: +`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`
+Retranslate start_str's tag: +`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'` ### Examples diff --git a/book_maker/cli.py b/book_maker/cli.py index e18db79..6af9924 100644 --- a/book_maker/cli.py +++ b/book_maker/cli.py @@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there type=int, help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)", ) + parser.add_argument( + "--retranslate", + dest="retranslate", + nargs=4, + type=str, + help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional) + Retranslate from start_str to end_str's tag: + python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously' + Retranslate start_str's tag: + python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' +""", + ) options = parser.parse_args() @@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there e.translation_style = options.translation_style if options.batch_size: e.batch_size = options.batch_size + if options.retranslate: + e.retranslate = options.retranslate + e.make_bilingual_book() diff --git a/book_maker/loader/epub_loader.py b/book_maker/loader/epub_loader.py index 7b7e6ee..6b501fb 100644 --- a/book_maker/loader/epub_loader.py +++ b/book_maker/loader/epub_loader.py @@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader): self.helper = EPUBBookLoaderHelper( self.translate_model, self.accumulated_num, self.translation_style ) + self.retranslate = None # monkey pathch for # 173 def _write_items_patch(obj): @@ -168,6 +169,173 @@ class EPUBBookLoader(BaseBookLoader): wait_p_list.append(p) count = length + def get_item(self, book, name): + for item in book.get_items(): + if item.file_name == name: + return item + + def find_items_containing_string(self, book, search_string): + matching_items = [] + + for item in book.get_items_of_type(ITEM_DOCUMENT): + content = item.get_content().decode("utf-8") + if search_string in content: + matching_items.append(item) + + return matching_items + + def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate): + complete_book_name = retranslate[0] + fixname = retranslate[1] + fixstart = retranslate[2] + fixend = retranslate[3] + + if fixend == "": + fixend = fixstart + + name_fix = complete_book_name + + complete_book = epub.read_epub(complete_book_name) + + if fixname == "": + fixname = self.find_items_containing_string(complete_book, fixstart)[ + 0 + ].file_name + print(f"auto find fixname: {fixname}") + + new_book = self._make_new_book(complete_book) + + complete_item = self.get_item(complete_book, fixname) + if complete_item is None: + return + + ori_item = self.get_item(self.origin_book, fixname) + if ori_item is None: + return + + soup_complete = bs(complete_item.content, "html.parser") + soup_ori = bs(ori_item.content, "html.parser") + + p_list_complete = soup_complete.findAll(trans_taglist) + p_list_ori = soup_ori.findAll(trans_taglist) + + target = None + tagl = [] + + # extract from range + find_end = False + find_start = False + for tag in p_list_complete: + if find_end: + tagl.append(tag) + break + + if fixend in tag.text: + find_end = True + if fixstart in tag.text: + find_start = True + + if find_start: + if not target: + target = tag.previous_sibling + tagl.append(tag) + + for t in tagl: + t.extract() + + flag = False + extract_p_list_ori = [] + for p in p_list_ori: + if fixstart in p.text: + flag = True + if flag: + extract_p_list_ori.append(p) + if fixend in p.text: + break + + for t in extract_p_list_ori: + target.insert_after(t) + target = t + + for item in complete_book.get_items(): + if item.file_name != fixname: + new_book.add_item(item) + + complete_item.content = soup_complete.prettify().encode() + + # ================================================= + index = self.process_item( + complete_item, + index, + p_to_save_len, + pbar, + new_book, + trans_taglist, + fixstart, + fixend, + ) + epub.write_epub(f"{name_fix}", new_book, {}) + + def process_item( + self, + item, + index, + p_to_save_len, + pbar, + new_book, + trans_taglist, + fixstart=None, + fixend=None, + ): + if not os.path.exists("log"): + os.makedirs("log") + + soup = bs(item.content, "html.parser") + p_list = soup.findAll(trans_taglist) + + if self.retranslate: + new_p_list = [] + + if fixstart is None or fixend is None: + return + + start_append = False + for p in p_list: + text = p.get_text() + if fixstart in text or fixend in text or start_append: + start_append = True + new_p_list.append(p) + if fixend in text: + p_list = new_p_list + break + + if self.allow_navigable_strings: + p_list.extend(soup.findAll(text=True)) + + send_num = self.accumulated_num + if send_num > 1: + with open("log/buglog.txt", "a") as f: + print(f"------------- {item.file_name} -------------", file=f) + + print("------------------------------------------------------") + print(f"dealing {item.file_name} ...") + self.translate_paragraphs_acc(p_list, send_num) + else: + is_test_done = self.is_test and index > self.test_num + for p in p_list: + if is_test_done: + break + index = self._process_paragraph(p, index, p_to_save_len) + # pbar.update(delta) not pbar.update(index)? + pbar.update(1) + if self.is_test and index >= self.test_num: + break + + item.content = soup.prettify().encode() + new_book.add_item(item) + + return index + def make_bilingual_book(self): self.helper = EPUBBookLoaderHelper( self.translate_model, self.accumulated_num, self.translation_style @@ -191,6 +359,11 @@ class EPUBBookLoader(BaseBookLoader): index = 0 p_to_save_len = len(self.p_to_save) try: + if self.retranslate: + self.retranslate_book( + index, p_to_save_len, pbar, trans_taglist, self.retranslate + ) + exit(0) # Add the things that don't need to be translated first, so that you can see the img after the interruption for item in self.origin_book.get_items(): if item.get_type() != ITEM_DOCUMENT: @@ -199,35 +372,10 @@ class EPUBBookLoader(BaseBookLoader): for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT): # if item.file_name != "OEBPS/ch01.xhtml": # continue - if not os.path.exists("log"): - os.makedirs("log") + index = self.process_item( + item, index, p_to_save_len, pbar, new_book, trans_taglist + ) - soup = bs(item.content, "html.parser") - p_list = soup.findAll(trans_taglist) - if self.allow_navigable_strings: - p_list.extend(soup.findAll(text=True)) - - send_num = self.accumulated_num - if send_num > 1: - with open("log/buglog.txt", "a") as f: - print(f"------------- {item.file_name} -------------", file=f) - - print("------------------------------------------------------") - print(f"dealing {item.file_name} ...") - self.translate_paragraphs_acc(p_list, send_num) - else: - is_test_done = self.is_test and index > self.test_num - for p in p_list: - if is_test_done: - break - index = self._process_paragraph(p, index, p_to_save_len) - # pbar.update(delta) not pbar.update(index)? - pbar.update(1) - if self.is_test and index >= self.test_num: - break - - item.content = soup.prettify().encode() - new_book.add_item(item) if self.accumulated_num > 1: name, _ = os.path.splitext(self.epub_name) epub.write_epub(f"{name}_bilingual.epub", new_book, {})