diff --git a/book_maker/cli.py b/book_maker/cli.py index 66c1f5c..4d38475 100644 --- a/book_maker/cli.py +++ b/book_maker/cli.py @@ -186,7 +186,6 @@ So you are close to reaching the limit. You have to choose your own value, there "--batch_size", dest="batch_size", type=int, - default=10, help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)", ) @@ -260,12 +259,17 @@ So you are close to reaching the limit. You have to choose your own value, there model_api_base=model_api_base, is_test=options.test, test_num=options.test_num, - translate_tags=options.translate_tags, - allow_navigable_strings=options.allow_navigable_strings, - accumulated_num=options.accumulated_num, prompt_config=parse_prompt_arg(options.prompt_arg), - batch_size=options.batch_size, ) + # other options + if options.allow_navigable_strings: + e.allow_navigable_strings = True + if options.translate_tags: + e.translate_tags = options.translate_tags + if options.accumulated_num > 1: + e.accumulated_num = options.accumulated_num + if options.batch_size: + e.batch_size = options.batch_size e.make_bilingual_book() diff --git a/book_maker/loader/epub_loader.py b/book_maker/loader/epub_loader.py index 8a1801c..fc7fb53 100644 --- a/book_maker/loader/epub_loader.py +++ b/book_maker/loader/epub_loader.py @@ -1,7 +1,6 @@ import os -import re import pickle -import tiktoken +import string import sys from copy import copy from pathlib import Path @@ -12,98 +11,17 @@ from ebooklib import ITEM_DOCUMENT, epub from rich import print from tqdm import tqdm -from book_maker.utils import prompt_config_to_kwargs +from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs from .base_loader import BaseBookLoader - - -class EPUBBookLoaderHelper: - def __init__(self, translate_model, accumulated_num): - self.translate_model = translate_model - self.accumulated_num = accumulated_num - - def deal_new(self, p, wait_p_list): - self.deal_old(wait_p_list) - new_p = copy(p) - new_p.string = self.translate_model.translate(p.text) - p.insert_after(new_p) - - def deal_old(self, wait_p_list): - if len(wait_p_list) == 0: - return - - result_txt_list = self.translate_model.translate_list(wait_p_list) - - for i in range(len(wait_p_list)): - if i < len(result_txt_list): - p = wait_p_list[i] - new_p = copy(p) - new_p.string = result_txt_list[i] - p.insert_after(new_p) - - wait_p_list.clear() - - -# ref: https://platform.openai.com/docs/guides/chat/introduction -def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"): - messages = ( - { - "role": "user", - "content": text, - }, - ) - - """Returns the number of tokens used by a list of messages.""" - try: - encoding = tiktoken.encoding_for_model(model) - except KeyError: - encoding = tiktoken.get_encoding("cl100k_base") - if model == "gpt-3.5-turbo-0301": # note: future models may deviate from this - num_tokens = 0 - for message in messages: - num_tokens += ( - 4 # every message follows {role/name}\n{content}\n - ) - for key, value in message.items(): - num_tokens += len(encoding.encode(value)) - if key == "name": # if there's a name, the role is omitted - num_tokens += -1 # role is always required and always 1 token - num_tokens += 2 # every reply is primed with assistant - return num_tokens - else: - raise NotImplementedError( - f"""num_tokens_from_messages() is not presently implemented for model {model}. - See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""" - ) - - -def is_link(text): - url_pattern = re.compile( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" - ) - return bool(url_pattern.match(text.strip())) - - -def is_tail_Link(text, num=100): - text = text.strip() - url_pattern = re.compile( - r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$" - ) - return bool(url_pattern.match(text)) and len(text) < num - - -def is_source(text): - return text.strip().startswith("Source: ") - - -def is_list(text, num=80): - text = text.strip() - return re.match(r"^Listing\s*\d+", text) and len(text) < num - - -def is_figure(text, num=80): - text = text.strip() - return re.match(r"^Figure\s*\d+", text) and len(text) < num +from .helper import ( + EPUBBookLoaderHelper, + is_text_figure, + is_text_link, + is_text_list, + is_text_source, + is_text_tail_link, +) class EPUBBookLoader(BaseBookLoader): @@ -114,14 +32,9 @@ class EPUBBookLoader(BaseBookLoader): key, resume, language, - batch_size, model_api_base=None, is_test=False, test_num=5, - translate_tags="p", - allow_navigable_strings=False, - accumulated_num=1, - prompt_template=None, prompt_config=None, ): self.epub_name = epub_name @@ -134,9 +47,9 @@ class EPUBBookLoader(BaseBookLoader): ) self.is_test = is_test self.test_num = test_num - self.translate_tags = translate_tags - self.allow_navigable_strings = allow_navigable_strings - self.accumulated_num = accumulated_num + self.translate_tags = "p" + self.allow_navigable_strings = False + self.accumulated_num = 1 self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num) try: @@ -165,7 +78,12 @@ class EPUBBookLoader(BaseBookLoader): @staticmethod def _is_special_text(text): - return text.isdigit() or text.isspace() or is_link(text) + return ( + text.isdigit() + or text.isspace() + or is_text_link(text) + or all(char in string.punctuation for char in text) + ) def _make_new_book(self, book): new_book = epub.EpubBook() @@ -206,13 +124,15 @@ class EPUBBookLoader(BaseBookLoader): temp_p = copy(p) for sup in temp_p.find_all("sup"): sup.extract() - if ( - not p.text - or self._is_special_text(temp_p.text) - or is_source(temp_p.text) - or is_list(temp_p.text) - or is_figure(temp_p.text) - or is_tail_Link(temp_p.text) + if any( + [ + not p.text, + self._is_special_text(temp_p.text), + is_text_source(temp_p.text), + is_text_list(temp_p.text), + is_text_figure(temp_p.text), + is_text_tail_link(temp_p.text), + ] ): if i == len(p_list) - 1: self.helper.deal_old(wait_p_list) diff --git a/book_maker/loader/helper.py b/book_maker/loader/helper.py new file mode 100644 index 0000000..52d1d87 --- /dev/null +++ b/book_maker/loader/helper.py @@ -0,0 +1,58 @@ +import re +from copy import copy + + +class EPUBBookLoaderHelper: + def __init__(self, translate_model, accumulated_num): + self.translate_model = translate_model + self.accumulated_num = accumulated_num + + def deal_new(self, p, wait_p_list): + self.deal_old(wait_p_list) + new_p = copy(p) + new_p.string = self.translate_model.translate(p.text) + p.insert_after(new_p) + + def deal_old(self, wait_p_list): + if not wait_p_list: + return + + result_txt_list = self.translate_model.translate_list(wait_p_list) + + for i in range(len(wait_p_list)): + if i < len(result_txt_list): + p = wait_p_list[i] + new_p = copy(p) + new_p.string = result_txt_list[i] + p.insert_after(new_p) + + wait_p_list.clear() + + +def is_text_link(text): + url_pattern = re.compile( + r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" + ) + return bool(url_pattern.match(text.strip())) + + +def is_text_tail_link(text, num=100): + text = text.strip() + url_pattern = re.compile( + r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$" + ) + return bool(url_pattern.match(text)) and len(text) < num + + +def is_text_source(text): + return text.strip().startswith("Source: ") + + +def is_text_list(text, num=80): + text = text.strip() + return re.match(r"^Listing\s*\d+", text) and len(text) < num + + +def is_text_figure(text, num=80): + text = text.strip() + return re.match(r"^Figure\s*\d+", text) and len(text) < num diff --git a/book_maker/loader/txt_loader.py b/book_maker/loader/txt_loader.py index 0abc201..eb50bed 100644 --- a/book_maker/loader/txt_loader.py +++ b/book_maker/loader/txt_loader.py @@ -14,14 +14,9 @@ class TXTBookLoader(BaseBookLoader): key, resume, language, - batch_size, - translate_tags, - allow_navigable_strings, model_api_base=None, is_test=False, test_num=5, - accumulated_num=1, - prompt_template=None, prompt_config=None, ): self.txt_name = txt_name @@ -36,7 +31,7 @@ class TXTBookLoader(BaseBookLoader): self.bilingual_result = [] self.bilingual_temp_result = [] self.test_num = test_num - self.batch_size = batch_size + self.batch_size = 10 try: with open(f"{txt_name}", "r", encoding="utf-8") as f: diff --git a/book_maker/translator/__init__.py b/book_maker/translator/__init__.py index b345d46..9c76ea7 100644 --- a/book_maker/translator/__init__.py +++ b/book_maker/translator/__init__.py @@ -1,8 +1,8 @@ +from book_maker.translator.caiyun_translator import Caiyun from book_maker.translator.chatgptapi_translator import ChatGPTAPI +from book_maker.translator.deepl_translator import DeepL from book_maker.translator.google_translator import Google from book_maker.translator.gpt3_translator import GPT3 -from book_maker.translator.caiyun_translator import Caiyun -from book_maker.translator.deepl_translator import DeepL MODEL_DICT = { "chatgptapi": ChatGPTAPI, diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 14a03c0..f7f03a2 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -1,5 +1,5 @@ -import time import re +import time from copy import copy from os import environ diff --git a/book_maker/translator/deepl_translator.py b/book_maker/translator/deepl_translator.py index 817c3f1..9148e4e 100644 --- a/book_maker/translator/deepl_translator.py +++ b/book_maker/translator/deepl_translator.py @@ -3,7 +3,8 @@ import time import requests -from book_maker.utils import TO_LANGUAGE_CODE, LANGUAGES +from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE + from .base_translator import Base diff --git a/book_maker/utils.py b/book_maker/utils.py index ca5ac97..fc11dc3 100644 --- a/book_maker/utils.py +++ b/book_maker/utils.py @@ -1,3 +1,5 @@ +import tiktoken + # Borrowed from : https://github.com/openai/whisper LANGUAGES = { "en": "english", @@ -126,3 +128,36 @@ def prompt_config_to_kwargs(prompt_config): prompt_template=prompt_config.get("user", None), prompt_sys_msg=prompt_config.get("system", None), ) + + +# ref: https://platform.openai.com/docs/guides/chat/introduction +def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"): + messages = ( + { + "role": "user", + "content": text, + }, + ) + + """Returns the number of tokens used by a list of messages.""" + try: + encoding = tiktoken.encoding_for_model(model) + except KeyError: + encoding = tiktoken.get_encoding("cl100k_base") + if model == "gpt-3.5-turbo-0301": # note: future models may deviate from this + num_tokens = 0 + for message in messages: + num_tokens += ( + 4 # every message follows {role/name}\n{content}\n + ) + for key, value in message.items(): + num_tokens += len(encoding.encode(value)) + if key == "name": # if there's a name, the role is omitted + num_tokens += -1 # role is always required and always 1 token + num_tokens += 2 # every reply is primed with assistant + return num_tokens + else: + raise NotImplementedError( + f"""num_tokens_from_messages() is not presently implemented for model {model}. + See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""" + )