Cumulative translation (#148)

This commit is contained in:
hleft 2023-03-16 21:25:47 +08:00 committed by GitHub
parent a661131efb
commit e38a236be6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 420 additions and 76 deletions

1
.gitignore vendored
View File

@ -132,3 +132,4 @@ dmypy.json
.pyre/ .pyre/
/test_books/*.epub /test_books/*.epub
log/

View File

@ -40,6 +40,9 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
If you need to set the `system` role content, you can use the following format: `--prompt '{"user":"Translate {text} to {language}", "system": "You are a professional translator."}'` or `--prompt prompt_template_sample.json` (example of a JSON file can be found at [./prompt_template_sample.json](./prompt_template_sample.json)). If you need to set the `system` role content, you can use the following format: `--prompt '{"user":"Translate {text} to {language}", "system": "You are a professional translator."}'` or `--prompt prompt_template_sample.json` (example of a JSON file can be found at [./prompt_template_sample.json](./prompt_template_sample.json)).
You can also set the `user` and `system` role prompt by setting environment variables: `BBM_CHATGPTAPI_USER_MSG_TEMPLATE` and `BBM_CHATGPTAPI_SYS_MSG`. You can also set the `user` and `system` role prompt by setting environment variables: `BBM_CHATGPTAPI_USER_MSG_TEMPLATE` and `BBM_CHATGPTAPI_SYS_MSG`.
- Use the `--batch_size` parameter to specify the number of lines for batch translation (default is 10, currently only effective for txt files). - Use the `--batch_size` parameter to specify the number of lines for batch translation (default is 10, currently only effective for txt files).
- `--accumulated_num`: how many tokens to accumulate before sending a translation request. gpt-3.5 limits the total tokens to 4090. For example, with `--accumulated_num 1600`, OpenAI may output about 2200 tokens, and the system and user messages may take roughly another 200 more tokens; 1600 + 2200 + 200 = 4000, which is already close to the limit. You have to choose your own value — there is no way to know whether the limit will be reached before sending.
### Examples ### Examples

View File

@ -170,6 +170,18 @@ def main():
metavar="PROMPT_ARG", metavar="PROMPT_ARG",
help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.", help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
) )
parser.add_argument(
"--accumulated_num",
dest="accumulated_num",
type=int,
default=1,
help="""Wait for how many tokens have been accumulated before starting the translation.
gpt3.5 limits the total_token to 4090.
For example, if you use --accumulated_num 1600, maybe openai will output 2200 tokens
and maybe 200 tokens for other messages in the system messages user messages, 1600+2200+200=4000,
So you are close to reaching the limit. You have to choose your own value, there is no way to know if the limit is reached before sending
""",
)
parser.add_argument( parser.add_argument(
"--batch_size", "--batch_size",
dest="batch_size", dest="batch_size",
@ -250,6 +262,7 @@ def main():
test_num=options.test_num, test_num=options.test_num,
translate_tags=options.translate_tags, translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings, allow_navigable_strings=options.allow_navigable_strings,
accumulated_num=options.accumulated_num,
prompt_config=parse_prompt_arg(options.prompt_arg), prompt_config=parse_prompt_arg(options.prompt_arg),
batch_size=options.batch_size, batch_size=options.batch_size,
) )

View File

@ -1,5 +1,7 @@
import os import os
import re
import pickle import pickle
import tiktoken
import sys import sys
from copy import copy from copy import copy
from pathlib import Path from pathlib import Path
@ -15,6 +17,95 @@ from book_maker.utils import prompt_config_to_kwargs
from .base_loader import BaseBookLoader from .base_loader import BaseBookLoader
class EPUBBookLoaderHelper:
    """Batch-translation helper for the EPUB loader.

    Buffers paragraph nodes and inserts a translated copy after each
    original node using the supplied translate model.
    """

    def __init__(self, translate_model, accumulated_num):
        self.translate_model = translate_model
        self.accumulated_num = accumulated_num

    def deal_new(self, p, wait_p_list):
        """Flush the waiting batch, then translate *p* on its own."""
        self.deal_old(wait_p_list)
        translated = copy(p)
        translated.string = self.translate_model.translate(p.text)
        p.insert_after(translated)

    def deal_old(self, wait_p_list):
        """Translate the buffered paragraphs in one request, then empty the buffer."""
        if not wait_p_list:
            return
        translations = self.translate_model.translate_list(wait_p_list)
        # The model may return fewer lines than requested; pair only what we got.
        for node, text in zip(wait_p_list, translations):
            translated = copy(node)
            translated.string = text
            node.insert_after(translated)
        wait_p_list.clear()
# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Count the chat tokens consumed by sending *text* as a single user message.

    Follows the OpenAI cookbook accounting: fixed per-message framing tokens
    plus the encoded role/content values, plus the assistant reply primer.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model names fall back to the cl100k_base encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    if model != "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    message = {"role": "user", "content": text}
    # every message follows <im_start>{role/name}\n{content}<im_end>\n
    token_count = 4
    for key, value in message.items():
        token_count += len(encoding.encode(value))
        if key == "name":  # if there's a name, the role is omitted
            token_count -= 1  # role is always required and always 1 token
    # every reply is primed with <im_start>assistant
    return token_count + 2
def is_link(text):
    """Return True when *text* (after stripping) starts with an http/https URL."""
    url_re = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.match(url_re, text.strip()) is not None
def is_tail_Link(text, num=100):
    """True for a short line (under *num* chars once stripped) ending in an http(s) URL."""
    stripped = text.strip()
    tail_url = re.compile(
        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    )
    return len(stripped) < num and tail_url.match(stripped) is not None
def is_source(text):
    """True when the stripped text is a "Source: " attribution line."""
    stripped = text.strip()
    return stripped[:8] == "Source: "
def is_list(text, num=80):
    """Truthy for a short "Listing N" caption.

    Preserves the original contract: None when the prefix is absent,
    otherwise a bool for the length check (< *num* chars once stripped).
    """
    stripped = text.strip()
    caption = re.match(r"^Listing\s*\d+", stripped)
    return caption and len(stripped) < num
def is_figure(text, num=80):
    """Truthy for a short "Figure N" caption.

    Preserves the original contract: None when the prefix is absent,
    otherwise a bool for the length check (< *num* chars once stripped).
    """
    stripped = text.strip()
    caption = re.match(r"^Figure\s*\d+", stripped)
    return caption and len(stripped) < num
class EPUBBookLoader(BaseBookLoader): class EPUBBookLoader(BaseBookLoader):
def __init__( def __init__(
self, self,
@ -29,6 +120,8 @@ class EPUBBookLoader(BaseBookLoader):
test_num=5, test_num=5,
translate_tags="p", translate_tags="p",
allow_navigable_strings=False, allow_navigable_strings=False,
accumulated_num=1,
prompt_template=None,
prompt_config=None, prompt_config=None,
): ):
self.epub_name = epub_name self.epub_name = epub_name
@ -43,6 +136,8 @@ class EPUBBookLoader(BaseBookLoader):
self.test_num = test_num self.test_num = test_num
self.translate_tags = translate_tags self.translate_tags = translate_tags
self.allow_navigable_strings = allow_navigable_strings self.allow_navigable_strings = allow_navigable_strings
self.accumulated_num = accumulated_num
self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num)
try: try:
self.origin_book = epub.read_epub(self.epub_name) self.origin_book = epub.read_epub(self.epub_name)
@ -70,7 +165,7 @@ class EPUBBookLoader(BaseBookLoader):
@staticmethod @staticmethod
def _is_special_text(text): def _is_special_text(text):
return text.isdigit() or text.isspace() return text.isdigit() or text.isspace() or is_link(text)
def _make_new_book(self, book): def _make_new_book(self, book):
new_book = epub.EpubBook() new_book = epub.EpubBook()
@ -79,6 +174,70 @@ class EPUBBookLoader(BaseBookLoader):
new_book.toc = book.toc new_book.toc = book.toc
return new_book return new_book
def _process_paragraph(self, p, index, p_to_save_len):
    """Translate one paragraph node and insert the result after the original.

    Returns the updated progress *index*; empty/special paragraphs are
    skipped and leave the index unchanged.
    """
    text = p.text
    if not text or self._is_special_text(text):
        return index
    translated = copy(p)
    if self.resume and index < p_to_save_len:
        # Resuming: reuse the translation cached by a previous run.
        translated.string = self.p_to_save[index]
    elif type(p) == NavigableString:
        # Bare strings have no tag to fill in; insert the raw translation.
        translated = self.translate_model.translate(text)
        self.p_to_save.append(translated)
    else:
        translated.string = self.translate_model.translate(text)
        self.p_to_save.append(translated.text)
    p.insert_after(translated)
    index += 1
    if index % 20 == 0:
        # Checkpoint periodically so an interrupted run can resume.
        self._save_progress()
    return index
def translate_paragraphs_acc(self, p_list, send_num):
    """Translate paragraphs in batches, accumulating up to ``send_num`` tokens.

    Paragraphs are buffered in ``wait_p_list`` until their combined token
    count would reach ``send_num``; the buffer is then translated in one
    request via ``self.helper``.  ``p_list`` holds BeautifulSoup nodes;
    translated copies are inserted after the originals in place.
    """
    count = 0  # tokens accumulated in the current batch
    wait_p_list = []  # paragraphs waiting to be sent together
    for i in range(len(p_list)):
        p = p_list[i]
        # Estimate tokens on a copy with <sup> footnote markers removed,
        # so reference marks do not inflate the count.
        temp_p = copy(p)
        for sup in temp_p.find_all("sup"):
            sup.extract()
        # Skip paragraphs that should not be translated: empty/special
        # text, source attributions, listing/figure captions, and short
        # lines ending in a URL.
        if (
            not p.text
            or self._is_special_text(temp_p.text)
            or is_source(temp_p.text)
            or is_list(temp_p.text)
            or is_figure(temp_p.text)
            or is_tail_Link(temp_p.text)
        ):
            continue
        length = num_tokens_from_text(temp_p.text)
        if length > send_num:
            # Single paragraph already exceeds the budget: flush the
            # waiting batch, then translate this paragraph on its own.
            self.helper.deal_new(p, wait_p_list)
            continue
        if i == len(p_list) - 1:
            # Last paragraph: flush everything before the loop ends.
            if count + length < send_num:
                wait_p_list.append(p)
                self.helper.deal_old(wait_p_list)
            else:
                self.helper.deal_new(p, wait_p_list)
            break
        if count + length < send_num:
            count += length
            wait_p_list.append(p)
            # This is because the more paragraphs, the easier it is possible to translate different numbers of paragraphs, maybe you should find better values than 15 and 2
            # if len(wait_p_list) > 15 and count > send_num / 2:
            #     self.helper.deal_old(wait_p_list)
            #     count = 0
        else:
            # Budget reached: translate the waiting batch, then start a
            # fresh batch with the current paragraph.
            self.helper.deal_old(wait_p_list)
            wait_p_list.append(p)
            count = length
def make_bilingual_book(self): def make_bilingual_book(self):
new_book = self._make_new_book(self.origin_book) new_book = self._make_new_book(self.origin_book)
all_items = list(self.origin_book.get_items()) all_items = list(self.origin_book.get_items())
@ -99,43 +258,53 @@ class EPUBBookLoader(BaseBookLoader):
index = 0 index = 0
p_to_save_len = len(self.p_to_save) p_to_save_len = len(self.p_to_save)
try: try:
# Add the things that don't need to be translated first, so that you can see the img after the interruption
for item in self.origin_book.get_items(): for item in self.origin_book.get_items():
if item.get_type() == ITEM_DOCUMENT: if item.get_type() != ITEM_DOCUMENT:
new_book.add_item(item)
for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
# if item.file_name != "OEBPS/ch01.xhtml":
# continue
if not os.path.exists("log"):
os.makedirs("log")
soup = bs(item.content, "html.parser") soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist) p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings: if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True)) p_list.extend(soup.findAll(text=True))
send_num = self.accumulated_num
if send_num > 1:
with open("log/buglog.txt", "a") as f:
print(f"------------- {item.file_name} -------------", file=f)
print("------------------------------------------------------")
print(f"dealing {item.file_name} ...")
self.translate_paragraphs_acc(p_list, send_num)
else:
is_test_done = self.is_test and index > self.test_num is_test_done = self.is_test and index > self.test_num
for p in p_list: for p in p_list:
if is_test_done or not p.text or self._is_special_text(p.text): if is_test_done:
continue break
new_p = copy(p) index = self._process_paragraph(p, index, p_to_save_len)
# TODO banch of p to translate then combine
# PR welcome here
if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
else:
if type(p) == NavigableString:
new_p = self.translate_model.translate(p.text)
self.p_to_save.append(new_p)
else:
new_p.string = self.translate_model.translate(p.text)
self.p_to_save.append(new_p.text)
p.insert_after(new_p)
index += 1
if index % 20 == 0:
self._save_progress()
# pbar.update(delta) not pbar.update(index)? # pbar.update(delta) not pbar.update(index)?
pbar.update(1) pbar.update(1)
if self.is_test and index >= self.test_num: if self.is_test and index >= self.test_num:
break break
item.content = soup.prettify().encode() item.content = soup.prettify().encode()
new_book.add_item(item) new_book.add_item(item)
if self.accumulated_num > 1:
name, _ = os.path.splitext(self.epub_name) name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {}) epub.write_epub(f"{name}_bilingual.epub", new_book, {})
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
if self.accumulated_num == 1:
pbar.close() pbar.close()
except (KeyboardInterrupt, Exception) as e: except (KeyboardInterrupt, Exception) as e:
print(e) print(e)
if self.accumulated_num == 1:
print("you can resume it next time") print("you can resume it next time")
self._save_progress() self._save_progress()
self._save_temp_book() self._save_temp_book()

View File

@ -20,6 +20,8 @@ class TXTBookLoader(BaseBookLoader):
model_api_base=None, model_api_base=None,
is_test=False, is_test=False,
test_num=5, test_num=5,
accumulated_num=1,
prompt_template=None,
prompt_config=None, prompt_config=None,
): ):
self.txt_name = txt_name self.txt_name = txt_name
@ -102,7 +104,7 @@ class TXTBookLoader(BaseBookLoader):
for i in range(0, len(self.origin_book), self.batch_size) for i in range(0, len(self.origin_book), self.batch_size)
] ]
for i in range(0, len(sliced_list)): for i in range(len(sliced_list)):
batch_text = "".join(sliced_list[i]) batch_text = "".join(sliced_list[i])
self.bilingual_temp_result.append(batch_text) self.bilingual_temp_result.append(batch_text)
if self._is_special_text(self.origin_book[i]): if self._is_special_text(self.origin_book[i]):

View File

@ -1,4 +1,6 @@
import time import time
import re
from copy import copy
from os import environ from os import environ
import openai import openai
@ -38,46 +40,85 @@ class ChatGPTAPI(Base):
"OPENAI_API_SYS_MSG" "OPENAI_API_SYS_MSG"
) # XXX: for backward compatability, deprecate soon ) # XXX: for backward compatability, deprecate soon
or environ.get(PROMPT_ENV_MAP["system"]) or environ.get(PROMPT_ENV_MAP["system"])
or ""
) )
self.system_content = environ.get("OPENAI_API_SYS_MSG") or ""
max_num_token = -1
def rotate_key(self): def rotate_key(self):
openai.api_key = next(self.keys) openai.api_key = next(self.keys)
def get_translation(self, text): def create_chat_completion(self, text):
self.rotate_key() content = self.prompt_template.format(text=text, language=self.language)
messages = [] sys_content = self.prompt_sys_msg
if self.prompt_sys_msg: if self.system_content:
messages.append( sys_content = self.system_content
{"role": "system", "content": self.prompt_sys_msg}, messages = [
) {"role": "system", "content": sys_content},
messages.append( {"role": "user", "content": content},
{ ]
"role": "user",
"content": self.prompt_template.format(
text=text, language=self.language
),
}
)
completion = openai.ChatCompletion.create( return openai.ChatCompletion.create(
model="gpt-3.5-turbo", model="gpt-3.5-turbo",
messages=messages, messages=messages,
) )
t_text = (
completion["choices"][0] def get_translation(self, text):
.get("message") self.rotate_key()
.get("content")
.encode("utf8") completion = {}
.decode() try:
completion = self.create_chat_completion(text)
except Exception:
if (
not "choices" in completion
or not isinstance(completion["choices"], list)
or len(completion["choices"]) == 0
):
raise
if completion["choices"][0]["finish_reason"] != "length":
raise
# work well or exception finish by length limit
choice = completion["choices"][0]
t_text = choice.get("message").get("content").encode("utf8").decode()
if choice["finish_reason"] == "length":
with open("long_text.txt", "a") as f:
print(
f"""==================================================
The total token is too long and cannot be completely translated\n
{text}
""",
file=f,
) )
# usage = completion["usage"]
# print(f"total_token: {usage['total_tokens']}")
# if int(usage["total_tokens"]) > self.max_num_token:
# self.max_num_token = int(usage["total_tokens"])
# print(
# f"{usage['total_tokens']} {usage['prompt_tokens']} {usage['completion_tokens']} {self.max_num_token} (total_token, prompt_token, completion_tokens, max_history_total_token)"
# )
return t_text return t_text
def translate(self, text): def translate(self, text, needprint=True):
# print("=================================================")
start_time = time.time()
# todo: Determine whether to print according to the cli option # todo: Determine whether to print according to the cli option
print(text) if needprint:
print(re.sub("\n{3,}", "\n\n", text))
attempt_count = 0
max_attempts = 3
t_text = ""
while attempt_count < max_attempts:
try: try:
t_text = self.get_translation(text) t_text = self.get_translation(text)
break
except Exception as e: except Exception as e:
# todo: better sleep time? why sleep alawys about key_len # todo: better sleep time? why sleep alawys about key_len
# 1. openai server error or own network interruption, sleep for a fixed time # 1. openai server error or own network interruption, sleep for a fixed time
@ -86,9 +127,123 @@ class ChatGPTAPI(Base):
sleep_time = int(60 / self.key_len) sleep_time = int(60 / self.key_len)
print(e, f"will sleep {sleep_time} seconds") print(e, f"will sleep {sleep_time} seconds")
time.sleep(sleep_time) time.sleep(sleep_time)
attempt_count += 1
t_text = self.get_translation(text) if attempt_count == max_attempts:
print(f"Get {attempt_count} consecutive exceptions")
raise
# todo: Determine whether to print according to the cli option # todo: Determine whether to print according to the cli option
print(t_text.strip()) if needprint:
print(re.sub("\n{3,}", "\n\n", t_text))
elapsed_time = time.time() - start_time
# print(f"translation time: {elapsed_time:.1f}s")
return t_text return t_text
def translate_and_split_lines(self, text):
    """Translate *text* without echoing it, returning its non-empty stripped lines."""
    translated = self.translate(text, False)
    cleaned = (line.strip() for line in translated.split("\n"))
    return [line for line in cleaned if line != ""]
def get_best_result_list(
    self, plist_len, new_str, sleep_dur, result_list, max_retries=15
):
    """Retry the batch translation until it yields ``plist_len`` paragraphs.

    Tracks the best candidate seen so far — an exact match, a longer list
    that does not overshoot, or one shrinking back toward the target —
    and returns ``(best_list, retry_count)``.
    """
    if len(result_list) == plist_len:
        return result_list, 0
    best = result_list
    retry_count = 0
    while retry_count < max_retries and len(result_list) != plist_len:
        print(
            f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation"
        )
        print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
        time.sleep(sleep_dur)
        retry_count += 1
        result_list = self.translate_and_split_lines(new_str)
        exact = len(result_list) == plist_len
        closer_under = len(best) < len(result_list) <= plist_len
        shrinking_over = len(result_list) < len(best) and len(best) > plist_len
        if exact or closer_under or shrinking_over:
            best = result_list
    return best, retry_count
def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"):
    """Append a retry outcome to the bug log; no-op when nothing was retried."""
    if not retry_count:
        return
    print(f"retry {state}")
    entry = f"retry {state}, count = {retry_count}, time = {elapsed_time:.1f}s"
    with open(log_path, "a") as f:
        print(entry, file=f)
def log_translation_mismatch(
    self, plist_len, result_list, new_str, sep, log_path="log/buglog.txt"
):
    """Dump source/result paragraph pairs to the bug log when counts differ."""
    if len(result_list) == plist_len:
        return
    source_paragraphs = new_str.split(sep)
    with open(log_path, "a") as f:
        print(f"problem size: {plist_len - len(result_list)}", file=f)
        for i, src in enumerate(source_paragraphs):
            print(src, file=f)
            print(file=f)
            if i < len(result_list):
                print(result_list[i], file=f)
                print(file=f)
        print("=============================", file=f)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print(
        f"bug: {plist_len} paragraphs of text translated into {len(result_list)} paragraphs"
    )
    print("continue")
def translate_list(self, plist):
    """Translate a list of paragraph nodes as one numbered batch request.

    Returns the translated strings, best effort: retries and bug-log
    reporting handle paragraph-count mismatches.
    """
    sep = "\n\n\n\n\n"
    # Build "(1) text", "(2) text", ... with <sup> footnote markers removed.
    numbered = []
    for idx, p in enumerate(plist, start=1):
        pruned = copy(p)
        for sup in pruned.find_all("sup"):
            sup.extract()
        numbered.append(f"({idx}) " + pruned.get_text().strip())
    new_str = sep.join(numbered)
    plist_len = len(plist)
    print(f"plist len = {len(plist)}")
    result_list = self.translate_and_split_lines(new_str)
    start_time = time.time()
    result_list, retry_count = self.get_best_result_list(
        plist_len, new_str, 6, result_list
    )
    end_time = time.time()
    state = "fail" if len(result_list) != plist_len else "success"
    log_path = "log/buglog.txt"
    self.log_retry(state, retry_count, end_time - start_time, log_path)
    self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path)
    # del (num), num. sometime (num) will translated to num.
    result_list = [re.sub(r"^(\(\d+\)|\d+\.)\s*", "", s) for s in result_list]
    return result_list

View File

@ -4,3 +4,4 @@ requests
ebooklib ebooklib
rich rich
tqdm tqdm
tiktoken