feat: add txt book loader (#143)

* feat: add txt book loader

* chore: add test book

* style: black

* feat: add _save_temp_book

* doc: add txt support desc
This commit is contained in:
zstone12 2023-03-11 14:37:25 +08:00 committed by GitHub
parent d95f0b3942
commit aaa1ab4d7b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 3280 additions and 6 deletions

View File

@ -33,6 +33,11 @@ jobs:
python3 make_book.py --book_name "test_books/Liber_Esther.epub" --test --test_num 10 --model google --translate-tags div,p
python3 make_book.py --book_name "test_books/Liber_Esther.epub" --test --test_num 20 --model google
- name: make txt book test using google translate
run: |
python3 make_book.py --book_name "test_books/the_little_prince.txt" --test --test_num 20 --model google
- name: make openai key ebook test
if: env.OPENAI_API_KEY != null
run: |

View File

@ -1,6 +1,6 @@
# bilingual_book_maker
bilingual_book_maker 是一个 AI 翻译工具,使用 ChatGPT 帮助用户制作多语言版本的 epub 文件和图书。该工具仅适用于翻译进入公共版权领域的 epub 图书,不适用于有版权的书籍。请在使用之前阅读项目的 **[免责声明](./disclaimer.md)**。
bilingual_book_maker 是一个 AI 翻译工具,使用 ChatGPT 帮助用户制作多语言版本的 epub/txt 文件和图书。该工具仅适用于翻译进入公共版权领域的 epub/txt 图书,不适用于有版权的书籍。请在使用之前阅读项目的 **[免责声明](./disclaimer.md)**。
![image](https://user-images.githubusercontent.com/15976103/222317531-a05317c5-4eee-49de-95cd-04063d9539d9.png)
@ -8,7 +8,7 @@ bilingual_book_maker 是一个 AI 翻译工具,使用 ChatGPT 帮助用户制
## 准备
1. ChatGPT or OpenAI token [^token]
2. epub books
2. epub/txt books
3. 能正常联网的环境或 proxy
4. python3.8+
@ -50,6 +50,9 @@ python3 make_book.py --book_name test_books/animal_farm.epub --model gpt3 --lang
# Translate contents in <div> and <p>
python3 make_book.py --book_name test_books/animal_farm.epub --translate-tags div,p
# 翻译 txt 文件
python3 make_book.py --book_name test_books/the_little_prince.txt --openai_key ${openai_key} --test
```
更加小白的示例

View File

@ -5,7 +5,7 @@ Usage: make sure to add `--model google` in the command.
**[中文](./README-CN.md) | English**
# bilingual_book_maker
The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist users in creating multi-language versions of epub files and books. This tool is exclusively designed for translating epub books that have entered the public domain and is not intended for copyrighted works. Before using this tool, please review the project's **[disclaimer](./disclaimer.md)**.
The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist users in creating multi-language versions of epub/txt files and books. This tool is exclusively designed for translating epub/txt books that have entered the public domain and is not intended for copyrighted works. Before using this tool, please review the project's **[disclaimer](./disclaimer.md)**.
![image](https://user-images.githubusercontent.com/15976103/222317531-a05317c5-4eee-49de-95cd-04063d9539d9.png)
@ -13,7 +13,7 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
## Preparation
1. ChatGPT or OpenAI token [^token]
2. epub books
2. epub/txt books
3. Environment with internet access or proxy
4. Python 3.8+
@ -55,6 +55,9 @@ python3 make_book.py --book_name test_books/animal_farm.epub --model gpt3 --lang
# Translate contents in <div> and <p>
python3 make_book.py --book_name test_books/animal_farm.epub --translate-tags div,p
# translate txt file
python3 make_book.py --book_name test_books/the_little_prince.txt --openai_key ${openai_key} --test --language zh-hans
```
More understandable example

View File

@ -1,6 +1,9 @@
from book_maker.loader.epub_loader import EPUBBookLoader
from book_maker.loader.txt_loader import TXTBookLoader
# Registry of the available book loaders: each key is a book-type string and
# each value is the loader class registered for it (presumably keyed by the
# book's file extension -- confirm against the caller that does the lookup).
BOOK_LOADER_DICT = {
"epub": EPUBBookLoader
"epub": EPUBBookLoader,
"txt": TXTBookLoader
# TODO add more here
}

View File

@ -1 +1,113 @@
"""TODO"""
import sys
from pathlib import Path
from .base_loader import BaseBookLoader
class TXTBookLoader(BaseBookLoader):
    """Translate a plain-text book line by line into a bilingual text file.

    The source file is read as UTF-8 and split on ``"\\n"``. Every line that
    is not "special" (empty, whitespace-only or digits-only) is passed to the
    translation model, and the output interleaves each such line with its
    translation.

    Files written next to the source file:

    * ``<stem>_bilingual.txt`` -- the finished bilingual book.
    * ``<stem>_bilingual_temp.txt`` -- a partial book written on interruption.
    * ``.<stem>.temp.bin`` -- newline-separated translations used to resume.
    """

    def __init__(
        self,
        txt_name,
        model,
        key,
        resume,
        language,
        translate_tags,
        allow_navigable_strings,
        model_api_base=None,
        is_test=False,
        test_num=5,
    ):
        """Read the book and, when resuming, reload the saved translations.

        Args:
            txt_name: Path of the text file to translate.
            model: Callable invoked as ``model(key, language, model_api_base)``
                to build the translator; the result must offer ``translate``.
            key: Credential forwarded to ``model``.
            resume: When truthy, load previously saved translations.
            language: Target language forwarded to ``model``.
            translate_tags: Accepted but not used by this loader.
            allow_navigable_strings: Accepted but not used by this loader.
            model_api_base: Optional value forwarded to ``model``.
            is_test: When truthy, stop early once ``test_num`` is exceeded.
            test_num: Line-count threshold used when ``is_test`` is set.

        Raises:
            Exception: If the book or (when resuming) the resume file cannot
                be read.
        """
        self.txt_name = txt_name
        self.translate_model = model(key, language, model_api_base)
        self.is_test = is_test
        self.p_to_save = []
        self.bilingual_result = []
        self.bilingual_temp_result = []
        self.test_num = test_num

        try:
            with open(f"{txt_name}", "r", encoding="utf-8") as f:
                self.origin_book = f.read().split("\n")
        except Exception as e:
            # Keep the original message but chain the cause for debugging.
            raise Exception("can not load file") from e

        self.resume = resume
        self.bin_path = f"{Path(txt_name).parent}/.{Path(txt_name).stem}.temp.bin"
        if self.resume:
            self.load_state()

    @staticmethod
    def _is_special_text(text):
        """Return True for lines that are skipped rather than translated."""
        return text.isdigit() or text.isspace() or len(text) == 0

    def _make_new_book(self, book):
        """Do nothing; present only to satisfy the loader interface."""
        pass

    def make_bilingual_book(self):
        """Translate the book and save ``<stem>_bilingual.txt``.

        On any error or a keyboard interrupt, the translations gathered so
        far and a partial bilingual book are saved, then the process exits.
        """
        index = 0
        p_to_save_len = len(self.p_to_save)

        try:
            for line in self.origin_book:
                if self._is_special_text(line):
                    continue
                if self.resume and index < p_to_save_len:
                    # Reuse the translation restored from the resume file so
                    # the final book still contains the already-done part.
                    translated = self.p_to_save[index]
                else:
                    translated = self.translate_model.translate(line)
                    self.p_to_save.append(translated)
                self.bilingual_result.append(line)
                self.bilingual_result.append(translated)
                index += 1
                # NOTE(review): with ``>`` this processes test_num + 1 lines
                # before stopping; kept as-is to preserve existing behaviour.
                if self.is_test and index > self.test_num:
                    break

            self.save_file(
                f"{Path(self.txt_name).parent}/{Path(self.txt_name).stem}_bilingual.txt",
                self.bilingual_result,
            )
        except (KeyboardInterrupt, Exception) as e:
            print(e)
            print("you can resume it next time")
            self._save_progress()
            self._save_temp_book()
            # NOTE(review): exit status 0 is kept from the original code even
            # though this path is reached on failure.
            sys.exit(0)

    def _save_temp_book(self):
        """Write a partial bilingual book from the translations made so far.

        Every original line is kept; each translatable line is followed by
        its translation while saved translations remain.
        """
        index = 0
        # Build a fresh list so that repeated calls do not duplicate content.
        temp_result = []
        for line in self.origin_book:
            temp_result.append(line)
            if self._is_special_text(line):
                continue
            if index < len(self.p_to_save):
                temp_result.append(self.p_to_save[index])
            index += 1
        self.bilingual_temp_result = temp_result

        self.save_file(
            f"{Path(self.txt_name).parent}/{Path(self.txt_name).stem}_bilingual_temp.txt",
            self.bilingual_temp_result,
        )

    def _save_progress(self):
        """Persist the collected translations, one per line, for resuming.

        Raises:
            Exception: If the resume file cannot be written.
        """
        try:
            # The file is newline-delimited, so a translation containing line
            # breaks would shift every later entry on reload; flatten it.
            flattened = [" ".join(text.splitlines()) for text in self.p_to_save]
            # Write UTF-8 explicitly to match the encoding load_state reads.
            with open(self.bin_path, "w", encoding="utf-8") as f:
                f.write("\n".join(flattened))
        except Exception as e:
            raise Exception("can not save resume file") from e

    def load_state(self):
        """Restore saved translations from the resume file.

        Raises:
            Exception: If the resume file cannot be read.
        """
        try:
            with open(self.bin_path, "r", encoding="utf-8") as f:
                content = f.read()
        except Exception as e:
            raise Exception("can not load resume file") from e
        # "".split("\n") yields [""], which would count as one finished line;
        # treat an empty file as "nothing translated yet".
        self.p_to_save = content.split("\n") if content else []

    def save_file(self, book_path, content):
        """Write ``content`` (a list of lines) to ``book_path`` as UTF-8.

        Raises:
            Exception: If the file cannot be written.
        """
        try:
            with open(book_path, "w", encoding="utf-8") as f:
                f.write("\n".join(content))
        except Exception as e:
            raise Exception("can not save file") from e

File diff suppressed because it is too large Load Diff