finalize

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2025-07-29 06:11:44 +00:00 · 2025-07-26 17:16:05 -07:00 · 2025-07-26 17:16:05 -07:00 · e540d9d6dc
commit e540d9d6dc
parent 9579ac20b9
5 changed files with 290 additions and 3 deletions
--- a/README-CN.md
+++ b/README-CN.md
@ -71,6 +71,17 @@ bbook --book_name test_books/animal_farm.epub --openai_key ${openai_key} --test
  python3 make_book.py --book_name test_books/animal_farm.epub --model gemini --gemini_key ${gemini_key}
  ```

+* Qwen
+
+  使用 [Qwen](https://www.aliyun.com/product/dashscope) 模型进行翻译，支持 qwen-mt-turbo 和 qwen-mt-plus 模型。
+
+  使用 `--source_lang` 指定源语言，留空为自动检测。
+
+  ```shell
+  python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-turbo --language "Simplified Chinese"
+  python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-plus --language "Japanese" --source_lang "English"
+  ```
+
 * 腾讯交互翻译

  ```shell
--- a/README.md
+++ b/README.md
@ -89,9 +89,11 @@ bbook --book_name test_books/animal_farm.epub --openai_key ${openai_key} --test
  Support Alibaba Cloud [Qwen-MT](https://bailian.console.aliyun.com/) specialized translation model. Supports 92 languages with features like terminology intervention and translation memory.
  Use `--model qwen-mt-turbo` for faster/cheaper translation, or `--model qwen-mt-plus` for higher quality.

+  Use `--source_lang` to specify the source language explicitly, or leave it empty for auto-detection.
+
  ```shell
-  python3 make_book.py --book_name test_books/animal_farm.epub --model qwen-mt-turbo --language "Chinese"
-  python3 make_book.py --book_name test_books/animal_farm.epub --model qwen-mt-plus --language "Japanese" --source_lang "English"
+  python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-turbo --language "Simplified Chinese"
+  python3 make_book.py --book_name test_books/animal_farm.epub --qwen_key ${qwen_key} --model qwen-mt-plus --language "Japanese" --source_lang "English"
  ```

 * [Tencent TranSmart](https://transmart.qq.com)
--- a/book_maker/cli.py
+++ b/book_maker/cli.py
@ -184,6 +184,14 @@ def main():
        help="You can get xAI Key from  https://console.x.ai/",
    )

+    # for Qwen
+    parser.add_argument(
+        "--qwen_key",
+        dest="qwen_key",
+        type=str,
+        help="You can get Qwen Key from  https://bailian.console.aliyun.com/?tab=model#/api-key",
+    )
+
    parser.add_argument(
        "--test",
        dest="test",
@ -459,6 +467,8 @@ So you are close to reaching the limit. You have to choose your own value, there
        API_KEY = options.groq_key or env.get("BBM_GROQ_API_KEY")
    elif options.model == "xai":
        API_KEY = options.xai_key or env.get("BBM_XAI_API_KEY")
+    elif options.model.startswith("qwen-"):
+        API_KEY = options.qwen_key or env.get("BBM_QWEN_API_KEY")
    else:
        API_KEY = ""

@ -579,7 +589,7 @@ So you are close to reaching the limit. You have to choose your own value, there
        e.translate_model.set_o3mini_models()
    if options.model.startswith("claude-"):
        e.translate_model.set_claude_model(options.model)
-    if options.model.startswith("qwen"):
+    if options.model.startswith("qwen-"):
        e.translate_model.set_qwen_model(options.model)
    if options.block_size > 0:
        e.block_size = options.block_size
--- a/book_maker/translator/init.py
+++ b/book_maker/translator/init.py
@ -37,6 +37,7 @@ MODEL_DICT = {
    "tencentransmart": TencentTranSmart,
    "customapi": CustomAPI,
    "xai": XAIClient,
+    "qwen": QwenTranslator,
    "qwen-mt-turbo": QwenTranslator,
    "qwen-mt-plus": QwenTranslator,
    # add more here
--- a/book_maker/translator/qwen_translator.py
+++ b/book_maker/translator/qwen_translator.py
@ -0,0 +1,263 @@
+import re
+import time
+from os import environ
+from rich import print
+from openai import OpenAI
+
+from .base_translator import Base
+
+
+class QwenTranslator(Base):
+    """
+    Qwen-MT translator using Alibaba Cloud's DashScope API
+    Specialized translation model supporting 92 languages with advanced features
+    Official documentation: https://help.aliyun.com/document_detail/2860790.html
+
+    Requests go through an OpenAI-compatible client; per-request settings
+    (languages, terminology, domain hint, translation memory) are sent in the
+    ``translation_options`` field of the request's extra body.
+
+    Todo: support more languages, terminology, and domain hints
+    """
+
+    # Language mapping from bilingual_book_maker format to Qwen language codes
+    LANGUAGE_MAP = {
+        # Common languages
+        "english": "English",
+        "chinese": "Chinese",
+        "simplified chinese": "Chinese",
+        "traditional chinese": "Traditional Chinese",
+        "japanese": "Japanese",
+        "korean": "Korean",
+        "spanish": "Spanish",
+        "french": "French",
+        "german": "German",
+        "portuguese": "Portuguese",
+        "italian": "Italian",
+        "russian": "Russian",
+        "arabic": "Arabic",
+        "hindi": "Hindi",
+        "thai": "Thai",
+        "vietnamese": "Vietnamese",
+        "indonesian": "Indonesian",
+        "malay": "Malay",
+        "dutch": "Dutch",
+        "turkish": "Turkish",
+        "polish": "Polish",
+        "czech": "Czech",
+        "hungarian": "Hungarian",
+        "romanian": "Romanian",
+        "greek": "Greek",
+        "hebrew": "Hebrew",
+        "finnish": "Finnish",
+        "danish": "Danish",
+        "swedish": "Swedish",
+        "norwegian": "Norwegian Bokmål",
+        "ukrainian": "Ukrainian",
+        "bulgarian": "Bulgarian",
+        "serbian": "Serbian",
+        "croatian": "Croatian",
+        "slovenian": "Slovenian",
+        "slovak": "Slovak",
+        "lithuanian": "Lithuanian",
+        "latvian": "Latvian",
+        "estonian": "Estonian",
+        # Add more mappings as needed
+    }
+
+    def __init__(
+        self,
+        key,
+        language,
+        model="qwen-mt-turbo",
+        source_lang="auto",
+        api_base=None,
+        prompt_template=None,  # Not used for translation models
+        prompt_sys_msg=None,  # Not used for translation models
+        temperature=None,  # Not used for translation models
+        context_flag=False,
+        context_paragraph_limit=5,
+        terminology=None,
+        domain_hint=None,
+        **kwargs,
+    ) -> None:
+        """Create the API client and store the translation settings.
+
+        Args:
+            key: API key material, passed straight to ``Base.__init__``
+                (presumably that is what populates ``self.keys`` - confirm
+                against the base class).
+            language: Target language name; normalised by ``_map_language``.
+            model: "qwen-mt-turbo" or "qwen-mt-plus"; anything else falls
+                back to "qwen-mt-turbo" (see ``set_qwen_model``).
+            source_lang: Source language; an empty value is treated as "auto".
+            api_base: Base URL override for the OpenAI-compatible endpoint.
+            context_flag: When true, recent translation pairs are sent with
+                each request as translation memory.
+            context_paragraph_limit: Maximum number of pairs kept as memory.
+            terminology: Optional list of {"source", "target"} dicts.
+            domain_hint: Optional English description of the text's domain.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__(key, language)
+
+        # API configuration
+        self.api_base = api_base or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+        self.client = OpenAI(
+            api_key=next(self.keys), base_url=self.api_base, timeout=60
+        )
+
+        # Model configuration
+        # The set_* helpers store their value on the instance and return
+        # None, so they are called for effect only; assigning their result
+        # would overwrite the attribute they have just set with None.
+        self.set_qwen_model(model)
+        # An empty/None source language means "let the service detect it".
+        self.source_lang = source_lang or "auto"
+        self.target_lang = self._map_language(language)
+
+        # Advanced features
+        self.set_terminology(terminology)
+        self.set_domain_hint(domain_hint)
+
+        # Context/Translation memory support
+        self.context_flag = context_flag
+        self.context_list = []
+        self.context_translated_list = []
+        self.context_paragraph_limit = context_paragraph_limit
+
+        print("[bold blue]Qwen Translator initialized:[/bold blue]")
+        print(f"  Model: {self.model}")
+        print(f"  Source Language: {self.source_lang}")
+        print(f"  Target Language: {self.target_lang}")
+        if self.domain_hint:
+            print(f"  Domain Hint: {self.domain_hint}")
+
+    def rotate_key(self):
+        """Switch the client to the next API key from ``self.keys``.
+
+        If the key iterator is exhausted the current key stays in use.
+        """
+        try:
+            self.client.api_key = next(self.keys)
+        except StopIteration:
+            pass
+
+    def _map_language(self, language):
+        """Map a language name to the name sent to Qwen as the target language.
+
+        Resolution order:
+          1. exact (case-insensitive, whitespace-trimmed) ``LANGUAGE_MAP`` key;
+          2. the longest key contained in the requested name, so that e.g.
+             "Traditional Chinese (Taiwan)" resolves to "Traditional Chinese"
+             and not to the shorter "chinese" entry;
+          3. the first key (in table order) that contains the requested name;
+          4. the trimmed, title-cased input as a last resort.
+
+        An empty name skips the substring steps, because the empty string is
+        a substring of every key and would otherwise map to the first entry.
+        """
+        requested = language.lower().strip()
+
+        # Direct mapping
+        exact = self.LANGUAGE_MAP.get(requested)
+        if exact is not None:
+            return exact
+
+        if requested:
+            # Known name embedded in a longer description: prefer the most
+            # specific (longest) key.
+            contained = [name for name in self.LANGUAGE_MAP if name in requested]
+            if contained:
+                return self.LANGUAGE_MAP[max(contained, key=len)]
+
+            # Requested name is a fragment of a known name.
+            for name, qwen_name in self.LANGUAGE_MAP.items():
+                if requested in name:
+                    return qwen_name
+
+        # Fallback to original language name with proper capitalization
+        return language.strip().title()
+
+    def _create_translation_options(self):
+        """Build the ``translation_options`` dict for one API request.
+
+        Always contains ``source_lang`` and ``target_lang``. ``terms``,
+        ``domains`` and ``tm_list`` are added only when terminology, a domain
+        hint, or (with ``context_flag`` set) saved context pairs exist.
+        """
+        options = {"source_lang": self.source_lang, "target_lang": self.target_lang}
+
+        # Add terminology if provided
+        if self.terminology:
+            options["terms"] = self.terminology
+
+        # Add domain hint if provided (must be in English)
+        if self.domain_hint:
+            options["domains"] = self.domain_hint
+
+        # Add translation memory if context is enabled
+        if self.context_flag and self.context_list:
+            tm_list = [
+                {"source": src, "target": tgt}
+                for src, tgt in zip(self.context_list, self.context_translated_list)
+            ]
+            if tm_list:
+                options["tm_list"] = tm_list
+
+        return options
+
+    def save_context(self, text, t_text):
+        """Remember a source/translation pair for use as translation memory.
+
+        Does nothing unless ``context_flag`` is set. Once more than
+        ``context_paragraph_limit`` pairs are held, the oldest one is dropped.
+        """
+        if not self.context_flag:
+            return
+
+        self.context_list.append(text)
+        self.context_translated_list.append(t_text)
+
+        # Keep only the most recent paragraphs within the limit
+        if len(self.context_list) > self.context_paragraph_limit:
+            self.context_list.pop(0)
+            self.context_translated_list.pop(0)
+
+    def translate(self, text, needprint=True):
+        """Translate ``text`` and return the result.
+
+        Makes up to three attempts, sleeping one second between failures and
+        rotating the API key before each one.
+
+        Args:
+            text: Source text to translate.
+            needprint: When true, print the source and the translation.
+
+        Returns:
+            The stripped translation; an empty string if the response carried
+            no content; or the original ``text`` if every attempt raised.
+        """
+        start_time = time.time()
+
+        if needprint:
+            print(re.sub(r"\n{3,}", "\n\n", text))
+
+        attempt_count = 0
+        max_attempts = 3
+        t_text = ""
+
+        while attempt_count < max_attempts:
+            try:
+                self.rotate_key()
+
+                # Prepare messages
+                messages = [{"role": "user", "content": text}]
+
+                # Create translation options
+                translation_options = self._create_translation_options()
+
+                # Make API request
+                completion = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=messages,
+                    extra_body={"translation_options": translation_options},
+                )
+
+                # Extract translated text
+                content = completion.choices[0].message.content
+                t_text = content.strip() if content else ""
+
+                # Save to context for translation memory
+                if self.context_flag and t_text:
+                    self.save_context(text, t_text)
+
+                break
+
+            except Exception as e:
+                # Deliberately broad: any failure is retried, and the final
+                # fallback keeps the book build going with untranslated text.
+                attempt_count += 1
+                print(
+                    f"[red]Translation attempt {attempt_count} failed: {str(e)}[/red]"
+                )
+
+                if attempt_count >= max_attempts:
+                    print(
+                        f"[red]Translation failed after {max_attempts} attempts[/red]"
+                    )
+                    t_text = text  # Fallback to original text
+                else:
+                    time.sleep(1)  # Wait before retry
+
+        if needprint:
+            # Collapse blank-line runs outside the f-string: a backslash
+            # inside an f-string expression is a SyntaxError before 3.12.
+            collapsed = re.sub(r"\n{3,}", "\n\n", t_text)
+            print(f"[bold green]{collapsed}[/bold green]")
+
+        end_time = time.time()
+        print(f"[dim]Translation time: {end_time - start_time:.2f}s[/dim]")
+
+        return t_text
+
+    def set_terminology(self, terminology):
+        """Set custom terminology for translation
+
+        Stores the list on ``self.terminology`` (an empty list when nothing
+        is given) and returns None.
+
+        Args:
+            terminology: List of dict with 'source' and 'target' keys
+                        e.g., [{"source": "API", "target": "应用程序接口"}]
+        """
+        self.terminology = terminology or []
+        print(f"[blue]Terminology updated: {len(self.terminology)} terms[/blue]")
+
+    def set_domain_hint(self, domain_hint):
+        """Set domain hint for specialized translation
+
+        Stores the hint on ``self.domain_hint`` (an empty string when nothing
+        is given) and returns None.
+
+        Args:
+            domain_hint: String describing the domain in English
+                        e.g., "Technical documentation for software development"
+        """
+        self.domain_hint = domain_hint or ""
+        print(f"[blue]Domain hint set: {self.domain_hint}[/blue]")
+
+    def set_qwen_model(self, model_name):
+        """Set Qwen model type
+
+        Stores the choice on ``self.model`` and returns None. Unsupported
+        names fall back to "qwen-mt-turbo" with a warning.
+
+        Args:
+            model_name: Either "qwen-mt-turbo" or "qwen-mt-plus"
+        """
+        if model_name in ("qwen-mt-turbo", "qwen-mt-plus"):
+            self.model = model_name
+            print(f"[blue]Qwen model set to: {self.model}[/blue]")
+        else:
+            self.model = "qwen-mt-turbo"
+            print(
+                f"[red]Invalid Qwen model: {model_name}. Using default: {self.model}[/red]"
+            )