support --retranslate

use string for start,end

clean

autofind filename

fix bug, don't use index

clean
This commit is contained in:
h 2023-03-20 01:11:53 +08:00
parent 6f56ac7a25
commit dfa1b9ada8
3 changed files with 196 additions and 28 deletions

View File

@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own
value, there is no way to know if the limit is reached before sending
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
- `--retranslate` `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"`<br>
All four values must be given. Pass an empty string (`''`) as `end_str` to retranslate only the tag that contains `start_str`, and pass an empty string as `file_name_in_epub` to let the tool pick the first file in the translated book that contains `start_str`.<br>
Retranslate from start_str's tag to end_str's tag:
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
Retranslate only start_str's tag:
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' ''`
### Examples

View File

@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
type=int,
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
)
# nargs=4 makes argparse reject anything but exactly four values, so the help
# text shows the empty-string placeholder instead of advertising end_str as
# something that can be left off the command line.
parser.add_argument(
    "--retranslate",
    dest="retranslate",
    nargs=4,
    type=str,
    help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"
All four values are required. Pass '' as end_str to retranslate only start_str's tag; pass '' as file_name_in_epub to use the first file that contains start_str.
Retranslate from start_str to end_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
Retranslate start_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' ''
""",
)
options = parser.parse_args()
@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
e.translation_style = options.translation_style
if options.batch_size:
e.batch_size = options.batch_size
if options.retranslate:
e.retranslate = options.retranslate
e.make_bilingual_book()

View File

@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
self.helper = EPUBBookLoaderHelper(
self.translate_model, self.accumulated_num, self.translation_style
)
self.retranslate = None
# monkey patch for # 173
def _write_items_patch(obj):
@ -168,6 +169,173 @@ class EPUBBookLoader(BaseBookLoader):
wait_p_list.append(p)
count = length
def get_item(self, book, name):
    """Return the first item of ``book`` whose ``file_name`` equals ``name``.

    Items are examined in the order ``book.get_items()`` yields them.
    ``None`` is returned when no item matches.
    """
    return next(
        (entry for entry in book.get_items() if entry.file_name == name),
        None,
    )
def find_items_containing_string(self, book, search_string):
    """Collect the document items of ``book`` that contain ``search_string``.

    Only items returned by ``book.get_items_of_type(ITEM_DOCUMENT)`` are
    considered. Each item's content is decoded as UTF-8 and checked with a
    plain substring test; matches keep the order in which ``book`` yields
    them. An empty list is returned when nothing matches.
    """
    return [
        doc
        for doc in book.get_items_of_type(ITEM_DOCUMENT)
        if search_string in doc.get_content().decode("utf-8")
    ]
def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
    """Translate one passage of an already translated epub again.

    In the translated file, the tags from the first one containing
    ``start_str`` up to and including the tag that follows the one
    containing ``end_str`` are removed. The tags covering the same passage
    in the original book are inserted at that position, the item is handed
    to ``process_item`` for translation, and the book is written back to
    ``translated_filepath``.

    Args:
        index: paragraph counter forwarded to ``process_item``.
        p_to_save_len: forwarded to ``process_item``.
        pbar: progress bar forwarded to ``process_item``.
        trans_taglist: tag names passed to ``findAll`` to locate the
            translatable tags.
        retranslate: sequence ``(translated_filepath, file_name_in_epub,
            start_str, end_str)``. An empty ``file_name_in_epub`` is
            replaced by the first document of the translated book that
            contains ``start_str``. An empty or missing ``end_str`` means
            ``start_str`` is used for both ends of the range.

    Returns:
        None. Nothing is written when the file or the start string cannot
        be located; a message is printed instead.
    """
    # Tolerate a three-element sequence so "end_str omitted" and
    # "end_str given as ''" behave the same.
    complete_book_name, fixname, fixstart, *rest = retranslate
    fixend = rest[0] if rest else ""
    if fixend == "":
        fixend = fixstart

    complete_book = epub.read_epub(complete_book_name)

    if fixname == "":
        candidates = self.find_items_containing_string(complete_book, fixstart)
        if not candidates:
            # Previously an IndexError on the empty result list.
            print(f"retranslate: start_str not found in {complete_book_name}")
            return
        fixname = candidates[0].file_name
        print(f"auto find fixname: {fixname}")

    new_book = self._make_new_book(complete_book)

    complete_item = self.get_item(complete_book, fixname)
    if complete_item is None:
        print(f"retranslate: {fixname} not found in {complete_book_name}")
        return
    ori_item = self.get_item(self.origin_book, fixname)
    if ori_item is None:
        print(f"retranslate: {fixname} not found in the original book")
        return

    soup_complete = bs(complete_item.content, "html.parser")
    soup_ori = bs(ori_item.content, "html.parser")
    p_list_complete = soup_complete.findAll(trans_taglist)
    p_list_ori = soup_ori.findAll(trans_taglist)

    # Locate the stale range in the translated file. ``anchor`` is the node
    # the replacement is inserted after; it is recorded exactly once, at the
    # first tag of the range. ``anchor_parent`` doubles as the "range start
    # seen" marker and as the fallback insertion point when that first tag
    # has no previous sibling.
    anchor = None
    anchor_parent = None
    stale_tags = []
    find_end = False
    find_start = False
    for tag in p_list_complete:
        if find_end:
            # One more tag after the end tag is dropped as well
            # (presumably the translation paired with it - confirm).
            stale_tags.append(tag)
            break
        if fixend in tag.text:
            find_end = True
        if fixstart in tag.text:
            find_start = True
        if find_start:
            if anchor_parent is None:
                anchor_parent = tag.parent
                anchor = tag.previous_sibling
            stale_tags.append(tag)

    if anchor_parent is None:
        # Without a start tag there is no insertion point; bail out before
        # anything is removed or written.
        print(f"retranslate: start_str not found in {fixname}")
        return

    for stale in stale_tags:
        stale.extract()

    # Collect the same passage from the original book.
    extract_p_list_ori = []
    in_range = False
    for p in p_list_ori:
        if fixstart in p.text:
            in_range = True
        if in_range:
            extract_p_list_ori.append(p)
        if fixend in p.text:
            break

    # Re-insert the original tags in order, each one after the previous.
    for fresh in extract_p_list_ori:
        if anchor is None:
            # The range started at the parent's first child.
            anchor_parent.insert(0, fresh)
        else:
            anchor.insert_after(fresh)
        anchor = fresh

    # Every other item is carried over unchanged; the edited item is added
    # by process_item once it has been translated.
    for item in complete_book.get_items():
        if item.file_name != fixname:
            new_book.add_item(item)

    complete_item.content = soup_complete.prettify().encode()
    self.process_item(
        complete_item,
        index,
        p_to_save_len,
        pbar,
        new_book,
        trans_taglist,
        fixstart,
        fixend,
    )
    epub.write_epub(complete_book_name, new_book, {})
def process_item(
    self,
    item,
    index,
    p_to_save_len,
    pbar,
    new_book,
    trans_taglist,
    fixstart=None,
    fixend=None,
):
    """Translate the tags of one epub item and add the item to ``new_book``.

    Args:
        item: epub item whose ``content`` is parsed, translated and
            replaced with the prettified result.
        index: running paragraph counter passed to and updated by
            ``_process_paragraph``.
        p_to_save_len: forwarded to ``_process_paragraph``.
        pbar: progress bar, advanced by one per processed paragraph.
        new_book: book that receives the processed item.
        trans_taglist: tag names passed to ``findAll`` to select the tags
            to translate.
        fixstart: only used when ``self.retranslate`` is set; text marking
            the first tag to translate.
        fixend: only used when ``self.retranslate`` is set; text marking
            the last tag to translate.

    Returns:
        The paragraph counter after processing. It is returned unchanged
        when ``self.accumulated_num > 1``, or when retranslate mode is
        active without both markers (in which case the item is not added).
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs("log", exist_ok=True)

    soup = bs(item.content, "html.parser")
    p_list = soup.findAll(trans_taglist)

    if self.retranslate:
        if fixstart is None or fixend is None:
            # Return the counter, not None, so callers that reassign their
            # index from the result keep a usable value.
            return index
        # Keep only the tags from the first one containing either marker
        # through the one containing fixend.
        selected = []
        for p in p_list:
            text = p.get_text()
            if selected or fixstart in text or fixend in text:
                selected.append(p)
                if fixend in text:
                    break
        # Always adopt the narrowed list. Previously it was only adopted
        # when fixend was found, so a missing end marker caused the whole
        # document to be translated again.
        p_list = selected

    if self.allow_navigable_strings:
        p_list.extend(soup.findAll(text=True))

    send_num = self.accumulated_num
    if send_num > 1:
        with open("log/buglog.txt", "a") as f:
            print(f"------------- {item.file_name} -------------", file=f)

        print("------------------------------------------------------")
        print(f"dealing {item.file_name} ...")
        self.translate_paragraphs_acc(p_list, send_num)
    else:
        # Evaluated once, before the loop: the test limit was already
        # exceeded by earlier items.
        is_test_done = self.is_test and index > self.test_num
        for p in p_list:
            if is_test_done:
                break
            index = self._process_paragraph(p, index, p_to_save_len)
            # pbar.update(delta) not pbar.update(index)?
            pbar.update(1)
            if self.is_test and index >= self.test_num:
                break

    item.content = soup.prettify().encode()
    new_book.add_item(item)

    return index
def make_bilingual_book(self):
self.helper = EPUBBookLoaderHelper(
self.translate_model, self.accumulated_num, self.translation_style
@ -191,6 +359,11 @@ class EPUBBookLoader(BaseBookLoader):
index = 0
p_to_save_len = len(self.p_to_save)
try:
if self.retranslate:
self.retranslate_book(
index, p_to_save_len, pbar, trans_taglist, self.retranslate
)
exit(0)
# Add the things that don't need to be translated first, so that you can see the img after the interruption
for item in self.origin_book.get_items():
if item.get_type() != ITEM_DOCUMENT:
@ -199,35 +372,10 @@ class EPUBBookLoader(BaseBookLoader):
for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
# if item.file_name != "OEBPS/ch01.xhtml":
# continue
if not os.path.exists("log"):
os.makedirs("log")
index = self.process_item(
item, index, p_to_save_len, pbar, new_book, trans_taglist
)
soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True))
send_num = self.accumulated_num
if send_num > 1:
with open("log/buglog.txt", "a") as f:
print(f"------------- {item.file_name} -------------", file=f)
print("------------------------------------------------------")
print(f"dealing {item.file_name} ...")
self.translate_paragraphs_acc(p_list, send_num)
else:
is_test_done = self.is_test and index > self.test_num
for p in p_list:
if is_test_done:
break
index = self._process_paragraph(p, index, p_to_save_len)
# pbar.update(delta) not pbar.update(index)?
pbar.update(1)
if self.is_test and index >= self.test_num:
break
item.content = soup.prettify().encode()
new_book.add_item(item)
if self.accumulated_num > 1:
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})