mirror of https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-05 19:15:34 +00:00

commit 4b51c59638 (parent 6195ede713)
fix: code format

book_maker/cli.py
@@ -186,7 +186,6 @@ So you are close to reaching the limit. You have to choose your own value, there
         "--batch_size",
         dest="batch_size",
         type=int,
-        default=10,
         help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
     )

@@ -260,12 +259,17 @@ So you are close to reaching the limit. You have to choose your own value, there
         model_api_base=model_api_base,
         is_test=options.test,
         test_num=options.test_num,
-        translate_tags=options.translate_tags,
-        allow_navigable_strings=options.allow_navigable_strings,
-        accumulated_num=options.accumulated_num,
         prompt_config=parse_prompt_arg(options.prompt_arg),
-        batch_size=options.batch_size,
     )
+    # other options
+    if options.allow_navigable_strings:
+        e.allow_navigable_strings = True
+    if options.translate_tags:
+        e.translate_tags = options.translate_tags
+    if options.accumulated_num > 1:
+        e.accumulated_num = options.accumulated_num
+    if options.batch_size:
+        e.batch_size = options.batch_size
     e.make_bilingual_book()

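
Note: the hunk above replaces four constructor keyword arguments with plain
attribute assignment after the loader is built. A minimal, hypothetical
sketch of that pattern (the `Loader` class below is an illustrative
stand-in, not the project's real loader):

    class Loader:
        def __init__(self, name):
            self.name = name
            self.batch_size = 10       # default now lives in the loader
            self.translate_tags = "p"  # default now lives in the loader

    options_batch_size = 20  # stands in for options.batch_size from argparse
    e = Loader("demo.txt")
    if options_batch_size:
        e.batch_size = options_batch_size
    print(e.batch_size)  # 20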

book_maker/loader/epub_loader.py
@@ -1,7 +1,6 @@
 import os
-import re
 import pickle
-import tiktoken
+import string
 import sys
 from copy import copy
 from pathlib import Path
@@ -12,98 +11,11 @@ from ebooklib import ITEM_DOCUMENT, epub
 from rich import print
 from tqdm import tqdm

-from book_maker.utils import prompt_config_to_kwargs
+from book_maker.utils import num_tokens_from_text, prompt_config_to_kwargs

 from .base_loader import BaseBookLoader
+from .helper import (EPUBBookLoaderHelper, is_text_figure, is_text_link,
+                     is_text_list, is_text_source, is_text_tail_link)


-class EPUBBookLoaderHelper:
-    def __init__(self, translate_model, accumulated_num):
-        self.translate_model = translate_model
-        self.accumulated_num = accumulated_num
-
-    def deal_new(self, p, wait_p_list):
-        self.deal_old(wait_p_list)
-        new_p = copy(p)
-        new_p.string = self.translate_model.translate(p.text)
-        p.insert_after(new_p)
-
-    def deal_old(self, wait_p_list):
-        if len(wait_p_list) == 0:
-            return
-
-        result_txt_list = self.translate_model.translate_list(wait_p_list)
-
-        for i in range(len(wait_p_list)):
-            if i < len(result_txt_list):
-                p = wait_p_list[i]
-                new_p = copy(p)
-                new_p.string = result_txt_list[i]
-                p.insert_after(new_p)
-
-        wait_p_list.clear()
-
-
-# ref: https://platform.openai.com/docs/guides/chat/introduction
-def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
-    messages = (
-        {
-            "role": "user",
-            "content": text,
-        },
-    )
-
-    """Returns the number of tokens used by a list of messages."""
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-    except KeyError:
-        encoding = tiktoken.get_encoding("cl100k_base")
-    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
-        num_tokens = 0
-        for message in messages:
-            num_tokens += (
-                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
-            )
-            for key, value in message.items():
-                num_tokens += len(encoding.encode(value))
-                if key == "name":  # if there's a name, the role is omitted
-                    num_tokens += -1  # role is always required and always 1 token
-        num_tokens += 2  # every reply is primed with <im_start>assistant
-        return num_tokens
-    else:
-        raise NotImplementedError(
-            f"""num_tokens_from_messages() is not presently implemented for model {model}.
-See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
-        )
-
-
-def is_link(text):
-    url_pattern = re.compile(
-        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
-    )
-    return bool(url_pattern.match(text.strip()))
-
-
-def is_tail_Link(text, num=100):
-    text = text.strip()
-    url_pattern = re.compile(
-        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
-    )
-    return bool(url_pattern.match(text)) and len(text) < num
-
-
-def is_source(text):
-    return text.strip().startswith("Source: ")
-
-
-def is_list(text, num=80):
-    text = text.strip()
-    return re.match(r"^Listing\s*\d+", text) and len(text) < num
-
-
-def is_figure(text, num=80):
-    text = text.strip()
-    return re.match(r"^Figure\s*\d+", text) and len(text) < num
-
-
 class EPUBBookLoader(BaseBookLoader):
@@ -114,14 +26,9 @@ class EPUBBookLoader(BaseBookLoader):
         key,
         resume,
         language,
-        batch_size,
         model_api_base=None,
         is_test=False,
         test_num=5,
-        translate_tags="p",
-        allow_navigable_strings=False,
-        accumulated_num=1,
-        prompt_template=None,
         prompt_config=None,
     ):
         self.epub_name = epub_name
@@ -134,9 +41,9 @@ class EPUBBookLoader(BaseBookLoader):
         )
         self.is_test = is_test
         self.test_num = test_num
-        self.translate_tags = translate_tags
-        self.allow_navigable_strings = allow_navigable_strings
-        self.accumulated_num = accumulated_num
+        self.translate_tags = "p"
+        self.allow_navigable_strings = False
+        self.accumulated_num = 1
         self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num)

         try:
@@ -165,7 +72,12 @@ class EPUBBookLoader(BaseBookLoader):

     @staticmethod
     def _is_special_text(text):
-        return text.isdigit() or text.isspace() or is_link(text)
+        return (
+            text.isdigit()
+            or text.isspace()
+            or is_text_link(text)
+            or all(char in string.punctuation for char in text)
+        )

     def _make_new_book(self, book):
         new_book = epub.EpubBook()
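
Note: the new clause above skips paragraphs made entirely of punctuation.
A quick standalone check of that predicate (a hedged reproduction, not the
loader method itself; note it is also True for the empty string):

    import string

    def is_all_punctuation(text):
        # True when every character is ASCII punctuation, e.g. "***" or "---"
        return all(char in string.punctuation for char in text)

    print(is_all_punctuation("***"))  # True  -> treated as special, skipped
    print(is_all_punctuation("Hi!"))  # False -> still translated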

@@ -206,13 +118,15 @@ class EPUBBookLoader(BaseBookLoader):
             temp_p = copy(p)
             for sup in temp_p.find_all("sup"):
                 sup.extract()
-            if (
-                not p.text
-                or self._is_special_text(temp_p.text)
-                or is_source(temp_p.text)
-                or is_list(temp_p.text)
-                or is_figure(temp_p.text)
-                or is_tail_Link(temp_p.text)
+            if any(
+                [
+                    not p.text,
+                    self._is_special_text(temp_p.text),
+                    is_text_source(temp_p.text),
+                    is_text_list(temp_p.text),
+                    is_text_figure(temp_p.text),
+                    is_text_tail_link(temp_p.text),
+                ]
             ):
                 if i == len(p_list) - 1:
                     self.helper.deal_old(wait_p_list)

book_maker/loader/helper.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+import re
+from copy import copy
+
+
+class EPUBBookLoaderHelper:
+    def __init__(self, translate_model, accumulated_num):
+        self.translate_model = translate_model
+        self.accumulated_num = accumulated_num
+
+    def deal_new(self, p, wait_p_list):
+        self.deal_old(wait_p_list)
+        new_p = copy(p)
+        new_p.string = self.translate_model.translate(p.text)
+        p.insert_after(new_p)
+
+    def deal_old(self, wait_p_list):
+        if not wait_p_list:
+            return
+
+        result_txt_list = self.translate_model.translate_list(wait_p_list)
+
+        for i in range(len(wait_p_list)):
+            if i < len(result_txt_list):
+                p = wait_p_list[i]
+                new_p = copy(p)
+                new_p.string = result_txt_list[i]
+                p.insert_after(new_p)
+
+        wait_p_list.clear()
+
+
+def is_text_link(text):
+    url_pattern = re.compile(
+        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+    )
+    return bool(url_pattern.match(text.strip()))
+
+
+def is_text_tail_link(text, num=100):
+    text = text.strip()
+    url_pattern = re.compile(
+        r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
+    )
+    return bool(url_pattern.match(text)) and len(text) < num
+
+
+def is_text_source(text):
+    return text.strip().startswith("Source: ")
+
+
+def is_text_list(text, num=80):
+    text = text.strip()
+    return re.match(r"^Listing\s*\d+", text) and len(text) < num
+
+
+def is_text_figure(text, num=80):
+    text = text.strip()
+    return re.match(r"^Figure\s*\d+", text) and len(text) < num
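
Note: a hedged usage sketch of the new helper module. `FakeModel` below is
an illustrative stand-in for a real translate model; only the `translate` /
`translate_list` methods the helper calls are mocked:

    from bs4 import BeautifulSoup

    from book_maker.loader.helper import EPUBBookLoaderHelper

    class FakeModel:
        def translate(self, text):
            return f"[zh] {text}"

        def translate_list(self, p_list):
            # One aggregated call for the whole batch of <p> tags.
            return [f"[zh] {p.text}" for p in p_list]

    soup = BeautifulSoup("<p>one</p><p>two</p>", "html.parser")
    helper = EPUBBookLoaderHelper(FakeModel(), accumulated_num=2)
    wait_p_list = list(soup.find_all("p"))
    helper.deal_old(wait_p_list)  # inserts a translated copy after each <p>
    print(soup)  # <p>one</p><p>[zh] one</p><p>two</p><p>[zh] two</p>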

book_maker/loader/txt_loader.py
@@ -14,14 +14,9 @@ class TXTBookLoader(BaseBookLoader):
         key,
         resume,
         language,
-        batch_size,
-        translate_tags,
-        allow_navigable_strings,
         model_api_base=None,
         is_test=False,
         test_num=5,
-        accumulated_num=1,
-        prompt_template=None,
         prompt_config=None,
     ):
         self.txt_name = txt_name
@@ -36,7 +31,7 @@ class TXTBookLoader(BaseBookLoader):
         self.bilingual_result = []
         self.bilingual_temp_result = []
         self.test_num = test_num
-        self.batch_size = batch_size
+        self.batch_size = 10

         try:
             with open(f"{txt_name}", "r", encoding="utf-8") as f:

book_maker/obok.py
@@ -199,18 +199,8 @@ class ENCRYPTIONError(Exception):


 def _load_crypto_libcrypto():
-    from ctypes import (
-        CDLL,
-        POINTER,
-        Structure,
-        c_char_p,
-        c_int,
-        c_long,
-        c_ulong,
-        c_void_p,
-        cast,
-        create_string_buffer,
-    )
+    from ctypes import (CDLL, POINTER, Structure, c_char_p, c_int, c_long,
+                        c_ulong, c_void_p, cast, create_string_buffer)
     from ctypes.util import find_library

     if sys.platform.startswith("win"):

book_maker/translator/__init__.py
@@ -1,8 +1,8 @@
+from book_maker.translator.caiyun_translator import Caiyun
 from book_maker.translator.chatgptapi_translator import ChatGPTAPI
+from book_maker.translator.deepl_translator import DeepL
 from book_maker.translator.google_translator import Google
 from book_maker.translator.gpt3_translator import GPT3
-from book_maker.translator.caiyun_translator import Caiyun
-from book_maker.translator.deepl_translator import DeepL

 MODEL_DICT = {
     "chatgptapi": ChatGPTAPI,

book_maker/translator/chatgptapi_translator.py
@@ -1,5 +1,5 @@
-import time
 import re
+import time
 from copy import copy
 from os import environ

@@ -3,7 +3,8 @@ import time

 import requests
+
-from book_maker.utils import TO_LANGUAGE_CODE, LANGUAGES
+from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE

 from .base_translator import Base


book_maker/utils.py
@@ -1,3 +1,5 @@
+import tiktoken
+
 # Borrowed from : https://github.com/openai/whisper
 LANGUAGES = {
     "en": "english",
@@ -126,3 +128,36 @@ def prompt_config_to_kwargs(prompt_config):
         prompt_template=prompt_config.get("user", None),
         prompt_sys_msg=prompt_config.get("system", None),
     )
+
+
+# ref: https://platform.openai.com/docs/guides/chat/introduction
+def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
+    messages = (
+        {
+            "role": "user",
+            "content": text,
+        },
+    )
+
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
+        num_tokens = 0
+        for message in messages:
+            num_tokens += (
+                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+            )
+            for key, value in message.items():
+                num_tokens += len(encoding.encode(value))
+                if key == "name":  # if there's a name, the role is omitted
+                    num_tokens += -1  # role is always required and always 1 token
+        num_tokens += 2  # every reply is primed with <im_start>assistant
+        return num_tokens
+    else:
+        raise NotImplementedError(
+            f"""num_tokens_from_messages() is not presently implemented for model {model}.
+See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
+        )
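
Note: after this move, callers (such as the EPUB loader above) import the
token counter from book_maker.utils. A minimal usage sketch, assuming
tiktoken is installed; the exact count depends on the tokenizer:

    from book_maker.utils import num_tokens_from_text

    # Counts tokens as if the text were sent as a single user message to
    # gpt-3.5-turbo-0301, including the chat-format framing overhead.
    print(num_tokens_from_text("Hello, world."))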