mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-06 11:35:49 +00:00
support --retranslate
use string for start,end clean autofind filename fix bug, don't use index clean
This commit is contained in:
parent
6f56ac7a25
commit
dfa1b9ada8
@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
|
|||||||
output 2200 tokens, plus maybe 200 tokens for the other messages (system messages and user messages): 1600+2200+200=4000, so you are close to reaching the limit. You have to choose your own
|
output 2200 tokens, plus maybe 200 tokens for the other messages (system messages and user messages): 1600+2200+200=4000, so you are close to reaching the limit. You have to choose your own
|
||||||
value; there is no way to know whether the limit has been reached before sending
|
value; there is no way to know whether the limit has been reached before sending
|
||||||
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
|
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
|
||||||
|
- `--retranslate` usage: `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"`. All four values must be supplied; pass an empty string `""` as `end_str` to retranslate only the tag containing `start_str`, or as `file_name_in_epub` to have the file located automatically by searching for `start_str`.<br>
|
||||||
|
Retranslate every tag from the one containing start_str through the one containing end_str:
|
||||||
|
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
|
||||||
|
Retranslate only the tag containing start_str (end_str given as an empty string):
|
||||||
|
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' ''`
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
|
@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
|
|||||||
type=int,
|
type=int,
|
||||||
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
|
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--retranslate",
|
||||||
|
dest="retranslate",
|
||||||
|
nargs=4,
|
||||||
|
type=str,
|
||||||
|
help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
|
||||||
|
Retranslate from start_str to end_str's tag:
|
||||||
|
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
|
||||||
|
Retranslate start_str's tag:
|
||||||
|
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
options = parser.parse_args()
|
options = parser.parse_args()
|
||||||
|
|
||||||
@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
|
|||||||
e.translation_style = options.translation_style
|
e.translation_style = options.translation_style
|
||||||
if options.batch_size:
|
if options.batch_size:
|
||||||
e.batch_size = options.batch_size
|
e.batch_size = options.batch_size
|
||||||
|
if options.retranslate:
|
||||||
|
e.retranslate = options.retranslate
|
||||||
|
|
||||||
e.make_bilingual_book()
|
e.make_bilingual_book()
|
||||||
|
|
||||||
|
|
||||||
|
@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
|
|||||||
self.helper = EPUBBookLoaderHelper(
|
self.helper = EPUBBookLoaderHelper(
|
||||||
self.translate_model, self.accumulated_num, self.translation_style
|
self.translate_model, self.accumulated_num, self.translation_style
|
||||||
)
|
)
|
||||||
|
self.retranslate = None
|
||||||
|
|
||||||
# monkey patch for # 173
|
# monkey patch for # 173
|
||||||
def _write_items_patch(obj):
|
def _write_items_patch(obj):
|
||||||
@ -168,42 +169,146 @@ class EPUBBookLoader(BaseBookLoader):
|
|||||||
wait_p_list.append(p)
|
wait_p_list.append(p)
|
||||||
count = length
|
count = length
|
||||||
|
|
||||||
def make_bilingual_book(self):
|
def get_item(self, book, name):
|
||||||
self.helper = EPUBBookLoaderHelper(
|
for item in book.get_items():
|
||||||
self.translate_model, self.accumulated_num, self.translation_style
|
if item.file_name == name:
|
||||||
)
|
return item
|
||||||
new_book = self._make_new_book(self.origin_book)
|
|
||||||
all_items = list(self.origin_book.get_items())
|
def find_items_containing_string(self, book, search_string):
|
||||||
trans_taglist = self.translate_tags.split(",")
|
matching_items = []
|
||||||
all_p_length = sum(
|
|
||||||
|
for item in book.get_items_of_type(ITEM_DOCUMENT):
|
||||||
|
content = item.get_content().decode("utf-8")
|
||||||
|
if search_string in content:
|
||||||
|
matching_items.append(item)
|
||||||
|
|
||||||
|
return matching_items
|
||||||
|
|
||||||
|
def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
|
||||||
|
complete_book_name = retranslate[0]
|
||||||
|
fixname = retranslate[1]
|
||||||
|
fixstart = retranslate[2]
|
||||||
|
fixend = retranslate[3]
|
||||||
|
|
||||||
|
if fixend == "":
|
||||||
|
fixend = fixstart
|
||||||
|
|
||||||
|
name_fix = complete_book_name
|
||||||
|
|
||||||
|
complete_book = epub.read_epub(complete_book_name)
|
||||||
|
|
||||||
|
if fixname == "":
|
||||||
|
fixname = self.find_items_containing_string(complete_book, fixstart)[
|
||||||
0
|
0
|
||||||
if i.get_type() != ITEM_DOCUMENT
|
].file_name
|
||||||
else len(bs(i.content, "html.parser").findAll(trans_taglist))
|
print(f"auto find fixname: {fixname}")
|
||||||
for i in all_items
|
|
||||||
)
|
new_book = self._make_new_book(complete_book)
|
||||||
all_p_length += self.allow_navigable_strings * sum(
|
|
||||||
0
|
complete_item = self.get_item(complete_book, fixname)
|
||||||
if i.get_type() != ITEM_DOCUMENT
|
if complete_item is None:
|
||||||
else len(bs(i.content, "html.parser").findAll(text=True))
|
return
|
||||||
for i in all_items
|
|
||||||
)
|
ori_item = self.get_item(self.origin_book, fixname)
|
||||||
pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
|
if ori_item is None:
|
||||||
index = 0
|
return
|
||||||
p_to_save_len = len(self.p_to_save)
|
|
||||||
try:
|
soup_complete = bs(complete_item.content, "html.parser")
|
||||||
# Add the things that don't need to be translated first, so that you can see the img after the interruption
|
soup_ori = bs(ori_item.content, "html.parser")
|
||||||
for item in self.origin_book.get_items():
|
|
||||||
if item.get_type() != ITEM_DOCUMENT:
|
p_list_complete = soup_complete.findAll(trans_taglist)
|
||||||
|
p_list_ori = soup_ori.findAll(trans_taglist)
|
||||||
|
|
||||||
|
target = None
|
||||||
|
tagl = []
|
||||||
|
|
||||||
|
# extract from range
|
||||||
|
find_end = False
|
||||||
|
find_start = False
|
||||||
|
for tag in p_list_complete:
|
||||||
|
if find_end:
|
||||||
|
tagl.append(tag)
|
||||||
|
break
|
||||||
|
|
||||||
|
if fixend in tag.text:
|
||||||
|
find_end = True
|
||||||
|
if fixstart in tag.text:
|
||||||
|
find_start = True
|
||||||
|
|
||||||
|
if find_start:
|
||||||
|
if not target:
|
||||||
|
target = tag.previous_sibling
|
||||||
|
tagl.append(tag)
|
||||||
|
|
||||||
|
for t in tagl:
|
||||||
|
t.extract()
|
||||||
|
|
||||||
|
flag = False
|
||||||
|
extract_p_list_ori = []
|
||||||
|
for p in p_list_ori:
|
||||||
|
if fixstart in p.text:
|
||||||
|
flag = True
|
||||||
|
if flag:
|
||||||
|
extract_p_list_ori.append(p)
|
||||||
|
if fixend in p.text:
|
||||||
|
break
|
||||||
|
|
||||||
|
for t in extract_p_list_ori:
|
||||||
|
target.insert_after(t)
|
||||||
|
target = t
|
||||||
|
|
||||||
|
for item in complete_book.get_items():
|
||||||
|
if item.file_name != fixname:
|
||||||
new_book.add_item(item)
|
new_book.add_item(item)
|
||||||
|
|
||||||
for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
|
complete_item.content = soup_complete.prettify().encode()
|
||||||
# if item.file_name != "OEBPS/ch01.xhtml":
|
|
||||||
# continue
|
# =================================================
|
||||||
|
index = self.process_item(
|
||||||
|
complete_item,
|
||||||
|
index,
|
||||||
|
p_to_save_len,
|
||||||
|
pbar,
|
||||||
|
new_book,
|
||||||
|
trans_taglist,
|
||||||
|
fixstart,
|
||||||
|
fixend,
|
||||||
|
)
|
||||||
|
epub.write_epub(f"{name_fix}", new_book, {})
|
||||||
|
|
||||||
|
def process_item(
|
||||||
|
self,
|
||||||
|
item,
|
||||||
|
index,
|
||||||
|
p_to_save_len,
|
||||||
|
pbar,
|
||||||
|
new_book,
|
||||||
|
trans_taglist,
|
||||||
|
fixstart=None,
|
||||||
|
fixend=None,
|
||||||
|
):
|
||||||
if not os.path.exists("log"):
|
if not os.path.exists("log"):
|
||||||
os.makedirs("log")
|
os.makedirs("log")
|
||||||
|
|
||||||
soup = bs(item.content, "html.parser")
|
soup = bs(item.content, "html.parser")
|
||||||
p_list = soup.findAll(trans_taglist)
|
p_list = soup.findAll(trans_taglist)
|
||||||
|
|
||||||
|
if self.retranslate:
|
||||||
|
new_p_list = []
|
||||||
|
|
||||||
|
if fixstart is None or fixend is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
start_append = False
|
||||||
|
for p in p_list:
|
||||||
|
text = p.get_text()
|
||||||
|
if fixstart in text or fixend in text or start_append:
|
||||||
|
start_append = True
|
||||||
|
new_p_list.append(p)
|
||||||
|
if fixend in text:
|
||||||
|
p_list = new_p_list
|
||||||
|
break
|
||||||
|
|
||||||
if self.allow_navigable_strings:
|
if self.allow_navigable_strings:
|
||||||
p_list.extend(soup.findAll(text=True))
|
p_list.extend(soup.findAll(text=True))
|
||||||
|
|
||||||
@ -228,6 +333,49 @@ class EPUBBookLoader(BaseBookLoader):
|
|||||||
|
|
||||||
item.content = soup.prettify().encode()
|
item.content = soup.prettify().encode()
|
||||||
new_book.add_item(item)
|
new_book.add_item(item)
|
||||||
|
|
||||||
|
return index
|
||||||
|
|
||||||
|
def make_bilingual_book(self):
|
||||||
|
self.helper = EPUBBookLoaderHelper(
|
||||||
|
self.translate_model, self.accumulated_num, self.translation_style
|
||||||
|
)
|
||||||
|
new_book = self._make_new_book(self.origin_book)
|
||||||
|
all_items = list(self.origin_book.get_items())
|
||||||
|
trans_taglist = self.translate_tags.split(",")
|
||||||
|
all_p_length = sum(
|
||||||
|
0
|
||||||
|
if i.get_type() != ITEM_DOCUMENT
|
||||||
|
else len(bs(i.content, "html.parser").findAll(trans_taglist))
|
||||||
|
for i in all_items
|
||||||
|
)
|
||||||
|
all_p_length += self.allow_navigable_strings * sum(
|
||||||
|
0
|
||||||
|
if i.get_type() != ITEM_DOCUMENT
|
||||||
|
else len(bs(i.content, "html.parser").findAll(text=True))
|
||||||
|
for i in all_items
|
||||||
|
)
|
||||||
|
pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
|
||||||
|
index = 0
|
||||||
|
p_to_save_len = len(self.p_to_save)
|
||||||
|
try:
|
||||||
|
if self.retranslate:
|
||||||
|
self.retranslate_book(
|
||||||
|
index, p_to_save_len, pbar, trans_taglist, self.retranslate
|
||||||
|
)
|
||||||
|
exit(0)
|
||||||
|
# Add the things that don't need to be translated first, so that you can see the img after the interruption
|
||||||
|
for item in self.origin_book.get_items():
|
||||||
|
if item.get_type() != ITEM_DOCUMENT:
|
||||||
|
new_book.add_item(item)
|
||||||
|
|
||||||
|
for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
|
||||||
|
# if item.file_name != "OEBPS/ch01.xhtml":
|
||||||
|
# continue
|
||||||
|
index = self.process_item(
|
||||||
|
item, index, p_to_save_len, pbar, new_book, trans_taglist
|
||||||
|
)
|
||||||
|
|
||||||
if self.accumulated_num > 1:
|
if self.accumulated_num > 1:
|
||||||
name, _ = os.path.splitext(self.epub_name)
|
name, _ = os.path.splitext(self.epub_name)
|
||||||
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
|
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user