mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-05 19:15:34 +00:00
support --retranslate
use string for start,end clean autofind filename fix bug, don't use index clean
This commit is contained in:
parent
6f56ac7a25
commit
dfa1b9ada8
@ -44,6 +44,11 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
|
||||
output 2200 tokens, plus maybe 200 tokens for other messages (the system and user messages): 1600+2200+200=4000, so you are close to reaching the limit. You have to choose your own
|
||||
value; there is no way to know whether the limit will be reached before sending
|
||||
- `--translation_style` example: `--translation_style "color: #808080; font-style: italic;"`
|
||||
- `--retranslate` usage: `--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)`<br>
|
||||
Retranslate every tag from the one containing start_str through the one containing end_str:
|
||||
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'`<br>
|
||||
Retranslate only the tag containing start_str:
|
||||
`python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'`
|
||||
|
||||
### Examples
|
||||
|
||||
|
@ -194,6 +194,18 @@ So you are close to reaching the limit. You have to choose your own value, there
|
||||
type=int,
|
||||
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--retranslate",
|
||||
dest="retranslate",
|
||||
nargs=4,
|
||||
type=str,
|
||||
help="""--retranslate "$translated_filepath" "file_name_in_epub" "start_str" "end_str"(optional)
|
||||
Retranslate from start_str to end_str's tag:
|
||||
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which' 'This kind of thing is not a good symptom. Obviously'
|
||||
Retranslate start_str's tag:
|
||||
python3 "make_book.py" --book_name "test_books/animal_farm.epub" --retranslate 'test_books/animal_farm_bilingual.epub' 'index_split_002.html' 'in spite of the present book shortage which'
|
||||
""",
|
||||
)
|
||||
|
||||
options = parser.parse_args()
|
||||
|
||||
@ -283,6 +295,9 @@ So you are close to reaching the limit. You have to choose your own value, there
|
||||
e.translation_style = options.translation_style
|
||||
if options.batch_size:
|
||||
e.batch_size = options.batch_size
|
||||
if options.retranslate:
|
||||
e.retranslate = options.retranslate
|
||||
|
||||
e.make_bilingual_book()
|
||||
|
||||
|
||||
|
@ -47,6 +47,7 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
self.helper = EPUBBookLoaderHelper(
|
||||
self.translate_model, self.accumulated_num, self.translation_style
|
||||
)
|
||||
self.retranslate = None
|
||||
|
||||
# monkey patch for #173
|
||||
def _write_items_patch(obj):
|
||||
@ -168,42 +169,146 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
wait_p_list.append(p)
|
||||
count = length
|
||||
|
||||
def make_bilingual_book(self):
|
||||
self.helper = EPUBBookLoaderHelper(
|
||||
self.translate_model, self.accumulated_num, self.translation_style
|
||||
)
|
||||
new_book = self._make_new_book(self.origin_book)
|
||||
all_items = list(self.origin_book.get_items())
|
||||
trans_taglist = self.translate_tags.split(",")
|
||||
all_p_length = sum(
|
||||
def get_item(self, book, name):
    """Return the first item of ``book`` whose ``file_name`` equals ``name``.

    ``book`` only needs a ``get_items()`` method yielding objects with a
    ``file_name`` attribute. Returns ``None`` when no item matches.
    """
    return next(
        (candidate for candidate in book.get_items() if candidate.file_name == name),
        None,
    )
|
||||
|
||||
def find_items_containing_string(self, book, search_string):
    """Return every document item of ``book`` whose content contains ``search_string``.

    Each item's content is decoded as UTF-8 and searched with a plain
    substring test (no markup stripping). Matches are returned in the order
    ``book.get_items_of_type(ITEM_DOCUMENT)`` yields them; the list is empty
    when nothing matches.
    """
    return [
        document
        for document in book.get_items_of_type(ITEM_DOCUMENT)
        if search_string in document.get_content().decode("utf-8")
    ]
|
||||
|
||||
def retranslate_book(self, index, p_to_save_len, pbar, trans_taglist, retranslate):
    """Retranslate one passage of an already translated EPUB and rewrite that file.

    Args:
        index, p_to_save_len, pbar: forwarded unchanged to ``process_item``.
        trans_taglist: tag names whose elements are treated as translatable.
        retranslate: sequence ``[translated_filepath, file_name_in_epub,
            start_str, end_str]``. ``end_str`` may be omitted or empty, in
            which case only the tag containing ``start_str`` is redone.
            An empty ``file_name_in_epub`` means "use the first document of
            the translated book that contains ``start_str``".

    The stale passage is removed from the translated document, the matching
    untranslated tags from ``self.origin_book`` are put in its place, and the
    document is handed to ``process_item`` for translation. The result
    overwrites ``translated_filepath``. If the passage cannot be located the
    method prints a diagnostic and returns without writing anything.
    """
    complete_book_name = retranslate[0]
    fixname = retranslate[1]
    fixstart = retranslate[2]
    # end_str is documented as optional, so tolerate a 3-element sequence.
    fixend = retranslate[3] if len(retranslate) > 3 else ""
    if not fixend:
        fixend = fixstart

    if not fixstart:
        # An empty marker is a substring of every tag and would silently
        # select the first paragraph of the document.
        print("retranslate: start_str must not be empty")
        return

    complete_book = epub.read_epub(complete_book_name)

    if fixname == "":
        candidates = self.find_items_containing_string(complete_book, fixstart)
        if not candidates:
            print(f"retranslate: no document in {complete_book_name} contains start_str")
            return
        fixname = candidates[0].file_name
        print(f"auto find fixname: {fixname}")

    new_book = self._make_new_book(complete_book)

    complete_item = self.get_item(complete_book, fixname)
    if complete_item is None:
        print(f"retranslate: {fixname} not found in {complete_book_name}")
        return

    ori_item = self.get_item(self.origin_book, fixname)
    if ori_item is None:
        print(f"retranslate: {fixname} not found in the original book")
        return

    soup_complete = bs(complete_item.content, "html.parser")
    soup_ori = bs(ori_item.content, "html.parser")

    # Stale range in the translated file: start..end plus the one tag after
    # the end tag (presumably the translation inserted after the end
    # paragraph -- TODO confirm against process_item).
    stale_tags = self._select_tag_range(
        soup_complete.findAll(trans_taglist), fixstart, fixend, include_following=True
    )
    # Replacement range taken from the untranslated book.
    fresh_tags = self._select_tag_range(soup_ori.findAll(trans_taglist), fixstart, fixend)

    if not stale_tags or not fresh_tags:
        # Bail out before touching anything: extracting the old passage
        # without a replacement would delete text from the book.
        print(f"retranslate: start_str/end_str range not found in {fixname}")
        return

    # Insert the replacements in front of the first stale tag *before*
    # extracting it; this works even when that tag has no previous sibling.
    anchor = stale_tags[0]
    for tag in fresh_tags:
        anchor.insert_before(tag)
    for tag in stale_tags:
        tag.extract()

    # Every other item is copied over untouched; the fixed document is added
    # by process_item once it has been translated.
    for item in complete_book.get_items():
        if item.file_name != fixname:
            new_book.add_item(item)

    complete_item.content = soup_complete.prettify().encode()

    self.process_item(
        complete_item,
        index,
        p_to_save_len,
        pbar,
        new_book,
        trans_taglist,
        fixstart,
        fixend,
    )
    epub.write_epub(f"{complete_book_name}", new_book, {})

@staticmethod
def _select_tag_range(tags, start_str, end_str, include_following=False):
    """Return the run of ``tags`` from ``start_str`` through ``end_str``.

    The run begins at the first tag whose text contains ``start_str`` and
    ends at the first tag at or after it whose text contains ``end_str``.
    With ``include_following`` the single tag right after the end tag (if
    any) is appended as well. Returns ``[]`` when either marker is missing,
    so an occurrence of ``end_str`` before ``start_str`` is ignored.
    """
    remaining = iter(tags)
    selected = []
    started = False
    for tag in remaining:
        text = tag.text
        if not started:
            if start_str not in text:
                continue
            started = True
        selected.append(tag)
        if end_str in text:
            if include_following:
                follower = next(remaining, None)
                if follower is not None:
                    selected.append(follower)
            return selected
    return []
|
||||
|
||||
def process_item(
|
||||
self,
|
||||
item,
|
||||
index,
|
||||
p_to_save_len,
|
||||
pbar,
|
||||
new_book,
|
||||
trans_taglist,
|
||||
fixstart=None,
|
||||
fixend=None,
|
||||
):
|
||||
if not os.path.exists("log"):
|
||||
os.makedirs("log")
|
||||
|
||||
soup = bs(item.content, "html.parser")
|
||||
p_list = soup.findAll(trans_taglist)
|
||||
|
||||
if self.retranslate:
|
||||
new_p_list = []
|
||||
|
||||
if fixstart is None or fixend is None:
|
||||
return
|
||||
|
||||
start_append = False
|
||||
for p in p_list:
|
||||
text = p.get_text()
|
||||
if fixstart in text or fixend in text or start_append:
|
||||
start_append = True
|
||||
new_p_list.append(p)
|
||||
if fixend in text:
|
||||
p_list = new_p_list
|
||||
break
|
||||
|
||||
if self.allow_navigable_strings:
|
||||
p_list.extend(soup.findAll(text=True))
|
||||
|
||||
@ -228,6 +333,49 @@ class EPUBBookLoader(BaseBookLoader):
|
||||
|
||||
item.content = soup.prettify().encode()
|
||||
new_book.add_item(item)
|
||||
|
||||
return index
|
||||
|
||||
def make_bilingual_book(self):
|
||||
self.helper = EPUBBookLoaderHelper(
|
||||
self.translate_model, self.accumulated_num, self.translation_style
|
||||
)
|
||||
new_book = self._make_new_book(self.origin_book)
|
||||
all_items = list(self.origin_book.get_items())
|
||||
trans_taglist = self.translate_tags.split(",")
|
||||
all_p_length = sum(
|
||||
0
|
||||
if i.get_type() != ITEM_DOCUMENT
|
||||
else len(bs(i.content, "html.parser").findAll(trans_taglist))
|
||||
for i in all_items
|
||||
)
|
||||
all_p_length += self.allow_navigable_strings * sum(
|
||||
0
|
||||
if i.get_type() != ITEM_DOCUMENT
|
||||
else len(bs(i.content, "html.parser").findAll(text=True))
|
||||
for i in all_items
|
||||
)
|
||||
pbar = tqdm(total=self.test_num) if self.is_test else tqdm(total=all_p_length)
|
||||
index = 0
|
||||
p_to_save_len = len(self.p_to_save)
|
||||
try:
|
||||
if self.retranslate:
|
||||
self.retranslate_book(
|
||||
index, p_to_save_len, pbar, trans_taglist, self.retranslate
|
||||
)
|
||||
exit(0)
|
||||
# Add the things that don't need to be translated first, so that you can see the img after the interruption
|
||||
for item in self.origin_book.get_items():
|
||||
if item.get_type() != ITEM_DOCUMENT:
|
||||
new_book.add_item(item)
|
||||
|
||||
for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
|
||||
# if item.file_name != "OEBPS/ch01.xhtml":
|
||||
# continue
|
||||
index = self.process_item(
|
||||
item, index, p_to_save_len, pbar, new_book, trans_taglist
|
||||
)
|
||||
|
||||
if self.accumulated_num > 1:
|
||||
name, _ = os.path.splitext(self.epub_name)
|
||||
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
|
||||
|
Loading…
x
Reference in New Issue
Block a user