Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia 2025-07-26 17:16:05 -07:00
parent 9579ac20b9
commit e540d9d6dc
5 changed files with 290 additions and 3 deletions

View File

@ -71,6 +71,17 @@ bbook --book_name test_books/animal_farm.epub --openai_key ${openai_key} --test
python3 make_book.py --book_name test_books/animal_farm.epub --model gemini --gemini_key ${gemini_key}
```
* Qwen
使用 [Qwen](https://www.aliyun.com/product/dashscope) 模型进行翻译,支持 qwen-mt-turbo 和 qwen-mt-plus 模型。
使用 `--source_lang` 指定源语言,留空为自动检测。
```shell
python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-turbo --language "Simplified Chinese"
python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-plus --language "Japanese" --source_lang "English"
```
* 腾讯交互翻译
```shell

View File

@ -89,9 +89,11 @@ bbook --book_name test_books/animal_farm.epub --openai_key ${openai_key} --test
Supports Alibaba Cloud's [Qwen-MT](https://bailian.console.aliyun.com/) specialized translation model, which covers 92 languages and offers features such as terminology intervention and translation memory.
Use `--model qwen-mt-turbo` for faster/cheaper translation, or `--model qwen-mt-plus` for higher quality.
Use `--source_lang` to specify the source language explicitly, or leave it empty for auto-detection.
```shell
python3 make_book.py --book_name test_books/animal_farm.epub --model qwen-mt-turbo --language "Chinese"
python3 make_book.py --book_name test_books/animal_farm.epub --model qwen-mt-plus --language "Japanese" --source_lang "English"
python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-turbo --language "Simplified Chinese"
python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-plus --language "Japanese" --source_lang "English"
```
* [Tencent TranSmart](https://transmart.qq.com)

View File

@ -184,6 +184,14 @@ def main():
help="You can get xAI Key from https://console.x.ai/",
)
# for Qwen
parser.add_argument(
"--qwen_key",
dest="qwen_key",
type=str,
help="You can get Qwen Key from https://bailian.console.aliyun.com/?tab=model#/api-key",
)
parser.add_argument(
"--test",
dest="test",
@ -459,6 +467,8 @@ So you are close to reaching the limit. You have to choose your own value, there
API_KEY = options.groq_key or env.get("BBM_GROQ_API_KEY")
elif options.model == "xai":
API_KEY = options.xai_key or env.get("BBM_XAI_API_KEY")
elif options.model.startswith("qwen-"):
API_KEY = options.qwen_key or env.get("BBM_QWEN_API_KEY")
else:
API_KEY = ""
@ -579,7 +589,7 @@ So you are close to reaching the limit. You have to choose your own value, there
e.translate_model.set_o3mini_models()
if options.model.startswith("claude-"):
e.translate_model.set_claude_model(options.model)
if options.model.startswith("qwen"):
if options.model.startswith("qwen-"):
e.translate_model.set_qwen_model(options.model)
if options.block_size > 0:
e.block_size = options.block_size

View File

@ -37,6 +37,7 @@ MODEL_DICT = {
"tencentransmart": TencentTranSmart,
"customapi": CustomAPI,
"xai": XAIClient,
"qwen": QwenTranslator,
"qwen-mt-turbo": QwenTranslator,
"qwen-mt-plus": QwenTranslator,
# add more here

View File

@ -0,0 +1,263 @@
import re
import time
from os import environ
from rich import print
from openai import OpenAI
from .base_translator import Base
class QwenTranslator(Base):
    """Translator backed by the Qwen-MT models through an OpenAI-compatible API.

    Requests are sent with the ``openai`` client to ``api_base`` (DashScope's
    compatible-mode endpoint by default). The source/target languages and the
    optional terminology, domain hint, and translation memory are passed in the
    ``translation_options`` field of the request's ``extra_body``.

    Official documentation: https://help.aliyun.com/document_detail/2860790.html

    Todo: support more languages, terminology, and domain hints
    """

    # Maps lower-cased language names, as received by this class, to the
    # language names sent to the API as ``target_lang``.
    LANGUAGE_MAP = {
        # Common languages
        "english": "English",
        "chinese": "Chinese",
        "simplified chinese": "Chinese",
        "traditional chinese": "Traditional Chinese",
        "japanese": "Japanese",
        "korean": "Korean",
        "spanish": "Spanish",
        "french": "French",
        "german": "German",
        "portuguese": "Portuguese",
        "italian": "Italian",
        "russian": "Russian",
        "arabic": "Arabic",
        "hindi": "Hindi",
        "thai": "Thai",
        "vietnamese": "Vietnamese",
        "indonesian": "Indonesian",
        "malay": "Malay",
        "dutch": "Dutch",
        "turkish": "Turkish",
        "polish": "Polish",
        "czech": "Czech",
        "hungarian": "Hungarian",
        "romanian": "Romanian",
        "greek": "Greek",
        "hebrew": "Hebrew",
        "finnish": "Finnish",
        "danish": "Danish",
        "swedish": "Swedish",
        "norwegian": "Norwegian Bokmål",
        "ukrainian": "Ukrainian",
        "bulgarian": "Bulgarian",
        "serbian": "Serbian",
        "croatian": "Croatian",
        "slovenian": "Slovenian",
        "slovak": "Slovak",
        "lithuanian": "Lithuanian",
        "latvian": "Latvian",
        "estonian": "Estonian",
        # Add more mappings as needed
    }

    def __init__(
        self,
        key,
        language,
        model="qwen-mt-turbo",
        source_lang="auto",
        api_base=None,
        prompt_template=None,  # Not used for translation models
        prompt_sys_msg=None,  # Not used for translation models
        temperature=None,  # Not used for translation models
        context_flag=False,
        context_paragraph_limit=5,
        terminology=None,
        domain_hint=None,
        **kwargs,
    ) -> None:
        """Create the API client and store the translation settings.

        Args:
            key: API key(s), forwarded to ``Base.__init__``.
            language: Target language name; mapped through ``_map_language``.
            model: ``"qwen-mt-turbo"`` or ``"qwen-mt-plus"``.
            source_lang: Source language name; an empty value or ``None`` is
                treated as ``"auto"``.
            api_base: Base URL of the endpoint; defaults to the DashScope
                compatible-mode URL.
            prompt_template: Accepted for signature compatibility; unused.
            prompt_sys_msg: Accepted for signature compatibility; unused.
            temperature: Accepted for signature compatibility; unused.
            context_flag: When true, recent translation pairs are sent back to
                the API as translation memory.
            context_paragraph_limit: Maximum number of pairs kept in memory.
            terminology: Optional list of ``{"source": ..., "target": ...}``.
            domain_hint: Optional English description of the text's domain.
            **kwargs: Ignored.
        """
        super().__init__(key, language)

        # API configuration
        self.api_base = api_base or "https://dashscope.aliyuncs.com/compatible-mode/v1"
        self.client = OpenAI(
            api_key=next(self.keys), base_url=self.api_base, timeout=60
        )

        # Model configuration. The setters store the value themselves and also
        # return it, so these assignments no longer overwrite the attributes
        # with None.
        self.model = self.set_qwen_model(model)
        # An empty source language means "let the service detect it".
        self.source_lang = source_lang or "auto"
        self.target_lang = self._map_language(language)

        # Advanced features
        self.terminology = self.set_terminology(terminology)
        self.domain_hint = self.set_domain_hint(domain_hint)

        # Context/Translation memory support
        self.context_flag = context_flag
        self.context_list = []
        self.context_translated_list = []
        self.context_paragraph_limit = context_paragraph_limit

        print("[bold blue]Qwen Translator initialized:[/bold blue]")
        print(f"  Model: {self.model}")
        print(f"  Source Language: {self.source_lang}")
        print(f"  Target Language: {self.target_lang}")
        if self.domain_hint:
            print(f"  Domain Hint: {self.domain_hint}")

    def rotate_key(self):
        """Switch the client to the next API key from ``self.keys``."""
        try:
            self.client.api_key = next(self.keys)
        except StopIteration:
            # Deliberate best effort: with no further key available the
            # client simply keeps using the current one.
            pass

    def _map_language(self, language):
        """Return the API language name for ``language``.

        Lookup order: exact (case-insensitive) match in ``LANGUAGE_MAP``, then
        the first entry whose key contains, or is contained in, the input, and
        finally the input itself in title case.
        """
        language_lower = language.lower().strip()

        # Direct mapping
        if language_lower in self.LANGUAGE_MAP:
            return self.LANGUAGE_MAP[language_lower]

        # An empty string is a substring of every key, so without this guard
        # the partial match below would silently pick the first entry.
        if not language_lower:
            return language.title()

        # Try partial matching for common variations
        for name, mapped in self.LANGUAGE_MAP.items():
            if language_lower in name or name in language_lower:
                return mapped

        # Fallback to original language name with proper capitalization
        return language.title()

    def _create_translation_options(self):
        """Build the ``translation_options`` dict sent with each request.

        ``source_lang`` and ``target_lang`` are always present. ``terms``,
        ``domains`` and ``tm_list`` are added only when terminology, a domain
        hint, or saved context pairs (with ``context_flag`` set) exist.
        """
        options = {"source_lang": self.source_lang, "target_lang": self.target_lang}

        if self.terminology:
            options["terms"] = self.terminology

        # Domain hint (must be in English)
        if self.domain_hint:
            options["domains"] = self.domain_hint

        # Translation memory: previously translated pairs, oldest first.
        if self.context_flag and self.context_list:
            tm_list = [
                {"source": src, "target": tgt}
                for src, tgt in zip(self.context_list, self.context_translated_list)
            ]
            if tm_list:
                options["tm_list"] = tm_list

        return options

    def save_context(self, text, t_text):
        """Record a source/translation pair for use as translation memory.

        Does nothing unless ``context_flag`` is set. Only the most recent
        ``context_paragraph_limit`` pairs are kept.
        """
        if not self.context_flag:
            return

        self.context_list.append(text)
        self.context_translated_list.append(t_text)

        # Drop the oldest pairs beyond the limit. A limit below zero is
        # treated as zero so the slice never misbehaves.
        excess = len(self.context_list) - max(self.context_paragraph_limit, 0)
        if excess > 0:
            del self.context_list[:excess]
            del self.context_translated_list[:excess]

    def translate(self, text, needprint=True):
        """Translate ``text`` and return the translated string.

        The request is attempted up to three times, waiting one second between
        failed attempts. If every attempt raises, the original ``text`` is
        returned unchanged. An empty response yields an empty string.

        Args:
            text: Source text to translate.
            needprint: When true, echo the source and the translation.
        """
        start_time = time.time()

        if needprint:
            print(re.sub(r"\n{3,}", "\n\n", text))

        max_attempts = 3
        t_text = ""

        for attempt_count in range(1, max_attempts + 1):
            try:
                self.rotate_key()

                completion = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": text}],
                    extra_body={
                        "translation_options": self._create_translation_options()
                    },
                )

                content = completion.choices[0].message.content
                t_text = content.strip() if content else ""

                # Save to context for translation memory
                if self.context_flag and t_text:
                    self.save_context(text, t_text)
                break
            except Exception as e:
                print(
                    f"[red]Translation attempt {attempt_count} failed: {str(e)}[/red]"
                )
                if attempt_count >= max_attempts:
                    print(
                        f"[red]Translation failed after {max_attempts} attempts[/red]"
                    )
                    t_text = text  # Fallback to original text
                else:
                    time.sleep(1)  # Wait before retry

        if needprint:
            # Collapse blank-line runs outside the f-string: a backslash inside
            # an f-string replacement field is a SyntaxError before Python 3.12.
            collapsed = re.sub(r"\n{3,}", "\n\n", t_text)
            print(f"[bold green]{collapsed}[/bold green]")

        end_time = time.time()
        print(f"[dim]Translation time: {end_time - start_time:.2f}s[/dim]")

        return t_text

    def set_terminology(self, terminology):
        """Set custom terminology for translation.

        Args:
            terminology: List of dict with 'source' and 'target' keys
                e.g., [{"source": "API", "target": "应用程序接口"}]

        Returns:
            The stored terminology list (empty when none was given).
        """
        self.terminology = terminology or []
        print(f"[blue]Terminology updated: {len(self.terminology)} terms[/blue]")
        return self.terminology

    def set_domain_hint(self, domain_hint):
        """Set domain hint for specialized translation.

        Args:
            domain_hint: String describing the domain in English
                e.g., "Technical documentation for software development"

        Returns:
            The stored domain hint (empty string when none was given).
        """
        self.domain_hint = domain_hint or ""
        print(f"[blue]Domain hint set: {self.domain_hint}[/blue]")
        return self.domain_hint

    def set_qwen_model(self, model_name):
        """Set Qwen model type.

        Args:
            model_name: Either "qwen-mt-turbo" or "qwen-mt-plus"; any other
                value falls back to "qwen-mt-turbo" with a warning.

        Returns:
            The model name actually stored.
        """
        if model_name in ("qwen-mt-turbo", "qwen-mt-plus"):
            self.model = model_name
            print(f"[blue]Qwen model set to: {self.model}[/blue]")
        else:
            self.model = "qwen-mt-turbo"
            print(
                f"[red]Invalid Qwen model: {model_name}. Using default: {self.model}[/red]"
            )
        return self.model