feat: batch translate txt file (#153)

* feat: batch translate txt file

* feat: customizable batch size

resolve conflicts
This commit is contained in:
zstone12 2023-03-12 23:03:28 +08:00 committed by GitHub
parent f09c2717f1
commit 5d2c89a841
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 40 additions and 8 deletions

View File

@ -37,6 +37,10 @@ jobs:
run: |
python3 make_book.py --book_name "test_books/the_little_prince.txt" --test --test_num 20 --model google
- name: make txt book test with batch_size
run: |
python3 make_book.py --book_name "test_books/the_little_prince.txt" --test --batch_size 30 --test_num 20 --model google
- name: make openai key ebook test
if: env.OPENAI_API_KEY != null

View File

@ -42,7 +42,7 @@ bilingual_book_maker 是一个 AI 翻译工具,使用 ChatGPT 帮助用户制
16. 翻译完会生成一本 ${book_name}_bilingual.epub 的双语书
17. 如果出现了错误或使用 `CTRL+C` 中断命令,不想接下来继续翻译了,会生成一本 ${book_name}_bilingual_temp.epub 的书,直接改成你想要的名字就可以了
18. 如果你想要翻译电子书中的无标签字符串,可以使用 `--allow_navigable_strings` 参数,会将可遍历字符串加入翻译队列,**注意,在条件允许情况下,请寻找更规范的电子书**
19. 使用`--batch_size` 参数,指定批量翻译的行数(默认行数为10,目前只对txt生效)
### 示范用例
**如果使用 `pip install bbook_maker` 以下命令都可以改成 `bbook args`**
@ -72,6 +72,9 @@ python3 make_book.py --book_from kobo --device_path /tmp/kobo
# 翻译 txt 文件
python3 make_book.py --book_name test_books/the_little_prince.txt --test
# 聚合多行翻译 txt 文件
python3 make_book.py --book_name test_books/the_little_prince.txt --test --batch_size 20
```
更加小白的示例

View File

@ -41,6 +41,7 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
16. Once the translation is complete, a bilingual book named `${book_name}_bilingual.epub` will be generated.
17. If there are any errors, or you interrupt the translation by pressing `CTRL+C`, a book named `${book_name}_bilingual_temp.epub` will be generated. You can simply rename it to any desired name.
18. If you want to translate strings in an e-book that aren't labeled with any tags, you can use the `--allow_navigable_strings` parameter. This will add the strings to the translation queue. **Note that it's best to look for e-books that are more standardized if possible.**
19. Use the `--batch_size` parameter to specify the number of lines for batch translation (default is 10, currently only effective for txt files).
### Examples
@ -74,6 +75,8 @@ python3 make_book.py --book_from kobo --device_path /tmp/kobo
# translate txt file
python3 make_book.py --book_name test_books/the_little_prince.txt --test --language zh-hans
# translate a txt file, batching multiple lines per translation call
python3 make_book.py --book_name test_books/the_little_prince.txt --test --batch_size 20
```
A more beginner-friendly example

View File

@ -156,6 +156,13 @@ def main():
metavar="PROMPT_ARG",
help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
)
parser.add_argument(
"--batch_size",
dest="batch_size",
type=int,
default=10,
help="how many lines will be translated by aggregated translation(This options currently only applies to txt files)",
)
options = parser.parse_args()
PROXY = options.proxy
@ -219,6 +226,7 @@ def main():
translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings,
prompt_config=parse_prompt_arg(options.prompt_arg),
batch_size=options.batch_size,
)
e.make_bilingual_book()

View File

@ -22,6 +22,7 @@ class EPUBBookLoader(BaseBookLoader):
key,
resume,
language,
batch_size,
model_api_base=None,
is_test=False,
test_num=5,

View File

@ -14,6 +14,7 @@ class TXTBookLoader(BaseBookLoader):
key,
resume,
language,
batch_size,
translate_tags,
allow_navigable_strings,
model_api_base=None,
@ -33,6 +34,7 @@ class TXTBookLoader(BaseBookLoader):
self.bilingual_result = []
self.bilingual_temp_result = []
self.test_num = test_num
self.batch_size = batch_size
try:
with open(f"{txt_name}", "r", encoding="utf-8") as f:
@ -58,17 +60,22 @@ class TXTBookLoader(BaseBookLoader):
p_to_save_len = len(self.p_to_save)
try:
for i in self.origin_book:
if self._is_special_text(i):
sliced_list = [
self.origin_book[i : i + self.batch_size]
for i in range(0, len(self.origin_book), self.batch_size)
]
for i in sliced_list:
batch_text = "".join(i)
if self._is_special_text(batch_text):
continue
if self.resume and index < p_to_save_len:
pass
else:
temp = self.translate_model.translate(i)
temp = self.translate_model.translate(batch_text)
self.p_to_save.append(temp)
self.bilingual_result.append(i)
self.bilingual_result.append(batch_text)
self.bilingual_result.append(temp)
index += 1
index += self.batch_size
if self.is_test and index > self.test_num:
break
@ -86,8 +93,14 @@ class TXTBookLoader(BaseBookLoader):
def _save_temp_book(self):
index = 0
for i in range(0, len(self.origin_book)):
self.bilingual_temp_result.append(self.origin_book[i])
sliced_list = [
self.origin_book[i : i + self.batch_size]
for i in range(0, len(self.origin_book), self.batch_size)
]
for i in range(0, len(sliced_list)):
batch_text = "".join(sliced_list[i])
self.bilingual_temp_result.append(batch_text)
if self._is_special_text(self.origin_book[i]):
continue
if index < len(self.p_to_save):