fix: code format

This commit is contained in:
yihong0618 2023-03-17 22:35:19 +08:00
parent 6195ede713
commit 4b51c59638
9 changed files with 132 additions and 135 deletions

View File

@ -186,7 +186,6 @@ So you are close to reaching the limit. You have to choose your own value, there
"--batch_size", "--batch_size",
dest="batch_size", dest="batch_size",
type=int, type=int,
default=10,
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)", help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
) )
@ -260,12 +259,17 @@ So you are close to reaching the limit. You have to choose your own value, there
model_api_base=model_api_base, model_api_base=model_api_base,
is_test=options.test, is_test=options.test,
test_num=options.test_num, test_num=options.test_num,
translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings,
accumulated_num=options.accumulated_num,
prompt_config=parse_prompt_arg(options.prompt_arg), prompt_config=parse_prompt_arg(options.prompt_arg),
batch_size=options.batch_size,
) )
# other options
if options.allow_navigable_strings:
e.allow_navigable_strings = True
if options.translate_tags:
e.translate_tags = options.translate_tags
if options.accumulated_num > 1:
e.accumulated_num = options.accumulated_num
if options.batch_size:
e.batch_size = options.batch_size
e.make_bilingual_book() e.make_bilingual_book()

View File

@ -1,7 +1,6 @@
import os import os
import re
import pickle import pickle
import tiktoken import string
import sys import sys
from copy import copy from copy import copy
from pathlib import Path from pathlib import Path
@ -12,98 +11,11 @@ from ebooklib import ITEM_DOCUMENT, epub
from rich import print from rich import print
from tqdm import tqdm from tqdm import tqdm
from book_maker.utils import prompt_config_to_kwargs from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs
from .base_loader import BaseBookLoader from .base_loader import BaseBookLoader
from .helper import (EPUBBookLoaderHelper, is_text_figure, is_text_link,
is_text_list, is_text_source, is_text_tail_link)
class EPUBBookLoaderHelper:
    """Inserts translated copies of paragraph tags, singly or batched."""

    def __init__(self, translate_model, accumulated_num):
        # translate_model must provide translate(text) and translate_list(p_list)
        self.translate_model = translate_model
        # batching threshold used by the loader that owns this helper
        self.accumulated_num = accumulated_num

    def deal_new(self, p, wait_p_list):
        """Flush any pending paragraphs, then translate *p* immediately
        and insert the translated copy right after it."""
        self.deal_old(wait_p_list)
        new_p = copy(p)
        new_p.string = self.translate_model.translate(p.text)
        p.insert_after(new_p)

    def deal_old(self, wait_p_list):
        """Batch-translate all pending paragraphs, insert each translation
        after its source tag, then clear the pending list."""
        if not wait_p_list:  # idiomatic emptiness check (was len(...) == 0)
            return
        result_txt_list = self.translate_model.translate_list(wait_p_list)
        # zip truncates to the shorter sequence, preserving the original
        # bounds check for a model that returns fewer results than asked
        for p, translated in zip(wait_p_list, result_txt_list):
            new_p = copy(p)
            new_p.string = translated
            p.insert_after(new_p)
        wait_p_list.clear()
# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Return the number of chat tokens *text* consumes when sent as a
    single user message to *model*.

    Raises NotImplementedError for models whose per-message token
    overhead is unknown.
    """
    # NOTE: the original placed the docstring AFTER this assignment,
    # turning it into a dead string literal; moved to the proper position.
    messages = (
        {
            "role": "user",
            "content": text,
        },
    )
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # unknown model name: fall back to the cl100k_base encoding
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            # every message follows <im_start>{role/name}\n{content}<im_end>\n
            num_tokens += 4
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    raise NotImplementedError(
        f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
    )
def is_link(text):
    """Return True if *text* (after stripping whitespace) starts with an
    http/https URL."""
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.match(pattern, text.strip()) is not None
def is_tail_Link(text, num=100):
    """Return True if *text* ends with a URL and its stripped length is
    below *num* characters."""
    stripped = text.strip()
    tail_url = re.compile(
        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    )
    # length gate first; conjunction value is unchanged
    return len(stripped) < num and tail_url.match(stripped) is not None
def is_source(text):
    """Return True if the stripped text is a "Source: " attribution line."""
    stripped = text.strip()
    return stripped.startswith("Source: ")
def is_list(text, num=80):
    """Return True when *text* looks like a short "Listing N" caption.

    The regex result is coerced to bool so callers always receive
    True/False rather than a Match object or None (None -> False, so
    truthiness-based callers are unaffected).
    """
    stripped = text.strip()
    return bool(re.match(r"^Listing\s*\d+", stripped)) and len(stripped) < num
def is_figure(text, num=80):
    """Return True when *text* looks like a short "Figure N" caption.

    Coerces the regex match to bool so the predicate always returns
    True/False instead of Match/None (None -> False, truthiness intact).
    """
    stripped = text.strip()
    return bool(re.match(r"^Figure\s*\d+", stripped)) and len(stripped) < num
class EPUBBookLoader(BaseBookLoader): class EPUBBookLoader(BaseBookLoader):
@ -114,14 +26,9 @@ class EPUBBookLoader(BaseBookLoader):
key, key,
resume, resume,
language, language,
batch_size,
model_api_base=None, model_api_base=None,
is_test=False, is_test=False,
test_num=5, test_num=5,
translate_tags="p",
allow_navigable_strings=False,
accumulated_num=1,
prompt_template=None,
prompt_config=None, prompt_config=None,
): ):
self.epub_name = epub_name self.epub_name = epub_name
@ -134,9 +41,9 @@ class EPUBBookLoader(BaseBookLoader):
) )
self.is_test = is_test self.is_test = is_test
self.test_num = test_num self.test_num = test_num
self.translate_tags = translate_tags self.translate_tags = "p"
self.allow_navigable_strings = allow_navigable_strings self.allow_navigable_strings = False
self.accumulated_num = accumulated_num self.accumulated_num = 1
self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num) self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num)
try: try:
@ -165,7 +72,12 @@ class EPUBBookLoader(BaseBookLoader):
@staticmethod @staticmethod
def _is_special_text(text): def _is_special_text(text):
return text.isdigit() or text.isspace() or is_link(text) return (
text.isdigit()
or text.isspace()
or is_text_link(text)
or all(char in string.punctuation for char in text)
)
def _make_new_book(self, book): def _make_new_book(self, book):
new_book = epub.EpubBook() new_book = epub.EpubBook()
@ -206,13 +118,15 @@ class EPUBBookLoader(BaseBookLoader):
temp_p = copy(p) temp_p = copy(p)
for sup in temp_p.find_all("sup"): for sup in temp_p.find_all("sup"):
sup.extract() sup.extract()
if ( if any(
not p.text [
or self._is_special_text(temp_p.text) not p.text,
or is_source(temp_p.text) self._is_special_text(temp_p.text),
or is_list(temp_p.text) is_text_source(temp_p.text),
or is_figure(temp_p.text) is_text_list(temp_p.text),
or is_tail_Link(temp_p.text) is_text_figure(temp_p.text),
is_text_tail_link(temp_p.text),
]
): ):
if i == len(p_list) - 1: if i == len(p_list) - 1:
self.helper.deal_old(wait_p_list) self.helper.deal_old(wait_p_list)

View File

@ -0,0 +1,58 @@
import re
from copy import copy
class EPUBBookLoaderHelper:
    """Inserts translated copies of paragraph tags, one at a time or in
    a batched call."""

    def __init__(self, translate_model, accumulated_num):
        # translate_model must provide translate(text) and translate_list(p_list)
        self.translate_model = translate_model
        # batching threshold used by the loader that owns this helper
        self.accumulated_num = accumulated_num

    def deal_new(self, p, wait_p_list):
        """Flush pending paragraphs, then translate *p* on its own and
        insert the translated copy directly after it."""
        self.deal_old(wait_p_list)
        new_p = copy(p)
        new_p.string = self.translate_model.translate(p.text)
        p.insert_after(new_p)

    def deal_old(self, wait_p_list):
        """Batch-translate pending paragraphs, insert each translation
        after its source tag, and clear the pending list."""
        if not wait_p_list:
            return
        result_txt_list = self.translate_model.translate_list(wait_p_list)
        # zip truncates to the shorter list, tolerating a model that
        # returns fewer translations than paragraphs submitted
        for p, translated in zip(wait_p_list, result_txt_list):
            new_p = copy(p)
            new_p.string = translated
            p.insert_after(new_p)
        wait_p_list.clear()
def is_text_link(text):
    """Return True if *text* (after stripping whitespace) starts with an
    http/https URL."""
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return re.match(pattern, text.strip()) is not None
def is_text_tail_link(text, num=100):
    """Return True if *text* ends with a URL and its stripped length is
    below *num* characters."""
    stripped = text.strip()
    tail_url = re.compile(
        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    )
    # length gate first; conjunction value is unchanged
    return len(stripped) < num and tail_url.match(stripped) is not None
def is_text_source(text):
    """Return True if the stripped text is a "Source: " attribution line."""
    stripped = text.strip()
    return stripped.startswith("Source: ")
def is_text_list(text, num=80):
    """Return True when *text* looks like a short "Listing N" caption.

    The regex result is coerced to bool so callers always receive
    True/False rather than a Match object or None (None -> False, so
    truthiness-based callers are unaffected).
    """
    stripped = text.strip()
    return bool(re.match(r"^Listing\s*\d+", stripped)) and len(stripped) < num
def is_text_figure(text, num=80):
    """Return True when *text* looks like a short "Figure N" caption.

    Coerces the regex match to bool so the predicate always returns
    True/False instead of Match/None (None -> False, truthiness intact).
    """
    stripped = text.strip()
    return bool(re.match(r"^Figure\s*\d+", stripped)) and len(stripped) < num

View File

@ -14,14 +14,9 @@ class TXTBookLoader(BaseBookLoader):
key, key,
resume, resume,
language, language,
batch_size,
translate_tags,
allow_navigable_strings,
model_api_base=None, model_api_base=None,
is_test=False, is_test=False,
test_num=5, test_num=5,
accumulated_num=1,
prompt_template=None,
prompt_config=None, prompt_config=None,
): ):
self.txt_name = txt_name self.txt_name = txt_name
@ -36,7 +31,7 @@ class TXTBookLoader(BaseBookLoader):
self.bilingual_result = [] self.bilingual_result = []
self.bilingual_temp_result = [] self.bilingual_temp_result = []
self.test_num = test_num self.test_num = test_num
self.batch_size = batch_size self.batch_size = 10
try: try:
with open(f"{txt_name}", "r", encoding="utf-8") as f: with open(f"{txt_name}", "r", encoding="utf-8") as f:

View File

@ -199,18 +199,8 @@ class ENCRYPTIONError(Exception):
def _load_crypto_libcrypto(): def _load_crypto_libcrypto():
from ctypes import ( from ctypes import (CDLL, POINTER, Structure, c_char_p, c_int, c_long,
CDLL, c_ulong, c_void_p, cast, create_string_buffer)
POINTER,
Structure,
c_char_p,
c_int,
c_long,
c_ulong,
c_void_p,
cast,
create_string_buffer,
)
from ctypes.util import find_library from ctypes.util import find_library
if sys.platform.startswith("win"): if sys.platform.startswith("win"):

View File

@ -1,8 +1,8 @@
from book_maker.translator.caiyun_translator import Caiyun
from book_maker.translator.chatgptapi_translator import ChatGPTAPI from book_maker.translator.chatgptapi_translator import ChatGPTAPI
from book_maker.translator.deepl_translator import DeepL
from book_maker.translator.google_translator import Google from book_maker.translator.google_translator import Google
from book_maker.translator.gpt3_translator import GPT3 from book_maker.translator.gpt3_translator import GPT3
from book_maker.translator.caiyun_translator import Caiyun
from book_maker.translator.deepl_translator import DeepL
MODEL_DICT = { MODEL_DICT = {
"chatgptapi": ChatGPTAPI, "chatgptapi": ChatGPTAPI,

View File

@ -1,5 +1,5 @@
import time
import re import re
import time
from copy import copy from copy import copy
from os import environ from os import environ

View File

@ -3,7 +3,8 @@ import time
import requests import requests
from book_maker.utils import TO_LANGUAGE_CODE, LANGUAGES from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE
from .base_translator import Base from .base_translator import Base

View File

@ -1,3 +1,5 @@
import tiktoken
# Borrowed from : https://github.com/openai/whisper # Borrowed from : https://github.com/openai/whisper
LANGUAGES = { LANGUAGES = {
"en": "english", "en": "english",
@ -126,3 +128,36 @@ def prompt_config_to_kwargs(prompt_config):
prompt_template=prompt_config.get("user", None), prompt_template=prompt_config.get("user", None),
prompt_sys_msg=prompt_config.get("system", None), prompt_sys_msg=prompt_config.get("system", None),
) )
# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
    """Return the number of chat tokens *text* consumes when sent as a
    single user message to *model*.

    Raises NotImplementedError for models whose per-message token
    overhead is unknown.
    """
    # NOTE: the original placed the docstring AFTER this assignment,
    # turning it into a dead string literal; moved to the proper position.
    messages = (
        {
            "role": "user",
            "content": text,
        },
    )
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # unknown model name: fall back to the cl100k_base encoding
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            # every message follows <im_start>{role/name}\n{content}<im_end>\n
            num_tokens += 4
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    raise NotImplementedError(
        f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
    )