support --retranslate

use string for start,end

clean

autofind filename

fix bug, don't use index

clean
h 2023-03-20 01:11:53 +08:00
parent 6f56ac7a25
commit dfa1b9ada8
3 changed files with 196 additions and 28 deletions


@@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
output 2200 tokens and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000, So you are close to reaching the limit. You have to choose your own
value, there is no way to know if the limit is reached before sending
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
- `--retranslate` `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str" (optional)`<br>
  Retranslate every tag from start_str's tag through end_str's tag:
  `python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
  Retranslate only start_str's tag:
  `python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'`
### Examples


@@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
    type=int,
    help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
)
parser.add_argument(
    "--retranslate",
    dest="retranslate",
    nargs=4,
    type=str,
    help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
Retranslate from start_str to end_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
Retranslate start_str's tag:
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
""",
)
options = parser.parse_args()
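
For reference, a quick sketch (not part of the diff) of what this argument definition yields at runtime: with `nargs=4`, argparse always expects exactly four values after `--retranslate` and hands them back as a list of strings, and the `if fixend == "": fixend = fixstart` branch in `retranslate_book` suggests passing an empty string as the fourth value when only the start tag should be retranslated.

```python
# Sketch only: reproduces the --retranslate argument in isolation.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--retranslate", dest="retranslate", nargs=4, type=str)

# nargs=4 means exactly four values; the empty fourth value stands in for end_str.
options = parser.parse_args(
    [
        "--retranslate",
        "test_books/animal_farm_bilingual.epub",
        "index_split_002.html",
        "in spite of the present book shortage which",
        "",
    ]
)
print(options.retranslate)
# ['test_books/animal_farm_bilingual.epub', 'index_split_002.html',
#  'in spite of the present book shortage which', '']
```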
@@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
    e.translation_style = options.translation_style
if options.batch_size:
    e.batch_size = options.batch_size
if options.retranslate:
    e.retranslate = options.retranslate
e.make_bilingual_book()


@@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
        self.retranslate = None

        # monkey patch for #173
        def _write_items_patch(obj):
@@ -168,42 +169,146 @@ class EPUBBookLoader(BaseBookLoader):
            wait_p_list.append(p)
            count = length

    def get_item(self, book, name):
        for item in book.get_items():
            if item.file_name == name:
                return item

    def find_items_containing_string(self, book, search_string):
        matching_items = []
        for item in book.get_items_of_type(ITEM_DOCUMENT):
            content = item.get_content().decode("utf-8")
            if search_string in content:
                matching_items.append(item)
        return matching_items

    def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
        complete_book_name = retranslate[0]
        fixname = retranslate[1]
        fixstart = retranslate[2]
        fixend = retranslate[3]

        if fixend == "":
            fixend = fixstart

        name_fix = complete_book_name

        complete_book = epub.read_epub(complete_book_name)

        if fixname == "":
            fixname = self.find_items_containing_string(complete_book, fixstart)[
                0
            ].file_name
            print(f"auto find fixname: {fixname}")

        new_book = self._make_new_book(complete_book)

        complete_item = self.get_item(complete_book, fixname)
        if complete_item is None:
            return

        ori_item = self.get_item(self.origin_book, fixname)
        if ori_item is None:
            return

        soup_complete = bs(complete_item.content, "html.parser")
        soup_ori = bs(ori_item.content, "html.parser")

        p_list_complete = soup_complete.findAll(trans_taglist)
        p_list_ori = soup_ori.findAll(trans_taglist)

        target = None
        tagl = []

        # extract from range
        find_end = False
        find_start = False
        for tag in p_list_complete:
            if find_end:
                tagl.append(tag)
                break

            if fixend in tag.text:
                find_end = True

            if fixstart in tag.text:
                find_start = True

            if find_start:
                if not target:
                    target = tag.previous_sibling
                tagl.append(tag)

        for t in tagl:
            t.extract()

        flag = False
        extract_p_list_ori = []
        for p in p_list_ori:
            if fixstart in p.text:
                flag = True
            if flag:
                extract_p_list_ori.append(p)
            if fixend in p.text:
                break

        for t in extract_p_list_ori:
            target.insert_after(t)
            target = t

        for item in complete_book.get_items():
            if item.file_name != fixname:
                new_book.add_item(item)

        complete_item.content = soup_complete.prettify().encode()

        # =================================================

        index = self.process_item(
            complete_item,
            index,
            p_to_save_len,
            pbar,
            new_book,
            trans_taglist,
            fixstart,
            fixend,
        )
        epub.write_epub(f"{name_fix}", new_book, {})

    def process_item(
        self,
        item,
        index,
        p_to_save_len,
        pbar,
        new_book,
        trans_taglist,
        fixstart=None,
        fixend=None,
    ):
        if not os.path.exists("log"):
            os.makedirs("log")

        soup = bs(item.content, "html.parser")
        p_list = soup.findAll(trans_taglist)

        if self.retranslate:
            new_p_list = []

            if fixstart is None or fixend is None:
                return

            start_append = False
            for p in p_list:
                text = p.get_text()
                if fixstart in text or fixend in text or start_append:
                    start_append = True
                    new_p_list.append(p)
                if fixend in text:
                    p_list = new_p_list
                    break

        if self.allow_navigable_strings:
            p_list.extend(soup.findAll(text=True))
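
To make the splice in `retranslate_book` above easier to follow, here is a self-contained, simplified sketch of the same BeautifulSoup pattern on invented HTML (the tag contents and marker strings are made up for illustration): the tags from the start marker through the end marker are removed with `extract()`, and replacement tags are chained back in with `insert_after()` starting from the sibling just before the removed range.

```python
# Standalone illustration of the extract()/insert_after() splice; the HTML
# and marker strings are invented for the example.
from bs4 import BeautifulSoup as bs

html = "<div><p>intro</p><p>START old</p><p>old middle</p><p>END old</p><p>outro</p></div>"
soup = bs(html, "html.parser")

fixstart, fixend = "START", "END"
p_list = soup.findAll("p")

# Collect tags from the one containing fixstart through the one containing fixend,
# remembering the sibling just before the range as the re-insertion anchor.
# Like the code above, this assumes a sibling exists before the start tag.
target, tagl = None, []
find_start = False
for tag in p_list:
    if fixstart in tag.text:
        find_start = True
    if find_start:
        if target is None:
            target = tag.previous_sibling
        tagl.append(tag)
    if fixend in tag.text:
        break

for t in tagl:
    t.extract()

# Splice replacement tags in after the anchor, advancing the anchor each time.
replacements = bs("<p>START new</p><p>END new</p>", "html.parser").findAll("p")
for t in replacements:
    target.insert_after(t)
    target = t

print(soup)
# <div><p>intro</p><p>START new</p><p>END new</p><p>outro</p></div>
```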
@@ -228,6 +333,49 @@ class EPUBBookLoader(BaseBookLoader):
        item.content = soup.prettify().encode()
        new_book.add_item(item)
        return index

    def make_bilingual_book(self):
        self.helper = EPUBBookLoaderHelper(
            self.translate_model, self.accumulated_num, self.translation_style
        )
        new_book = self._make_new_book(self.origin_book)
        all_items = list(self.origin_book.get_items())
        trans_taglist = self.translate_tags.split(",")
        all_p_length = sum(
            0
            if i.get_type() != ITEM_DOCUMENT
            else len(bs(i.content, "html.parser").findAll(trans_taglist))
            for i in all_items
        )
        all_p_length += self.allow_navigable_strings * sum(
            0
            if i.get_type() != ITEM_DOCUMENT
            else len(bs(i.content, "html.parser").findAll(text=True))
            for i in all_items
        )
        pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
        index = 0
        p_to_save_len = len(self.p_to_save)
        try:
            if self.retranslate:
                self.retranslate_book(
                    index, p_to_save_len, pbar, trans_taglist, self.retranslate
                )
                exit(0)

            # Add the things that don't need to be translated first, so that you can see the img after the interruption
            for item in self.origin_book.get_items():
                if item.get_type() != ITEM_DOCUMENT:
                    new_book.add_item(item)

            for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
                # if item.file_name != "OEBPS/ch01.xhtml":
                #     continue
                index = self.process_item(
                    item, index, p_to_save_len, pbar, new_book, trans_taglist
                )

            if self.accumulated_num > 1:
                name, _ = os.path.splitext(self.epub_name)
                epub.write_epub(f"{name}_bilingual.epub", new_book, {})