fix: Fix translation paragraph count mismatch by explicitly instructing the LLM about paragraph requirements

2025-07-18 08:00:07 +00:00 · 2025-04-19 20:31:22 +08:00 · 2025-04-19 20:31:22 +08:00 · 83303d1dd8
commit 83303d1dd8
parent b0dbed8826
1 changed files with 5 additions and 8 deletions
--- a/book_maker/translator/chatgptapi_translator.py
+++ b/book_maker/translator/chatgptapi_translator.py
@@ -256,8 +256,7 @@ class ChatGPTAPI(Base):
            retry_count += 1

            # Make instructions increasingly explicit with each retry
-            emphasis = "!" * min(retry_count,
-                                 3)  # Add up to 3 exclamation marks
+            emphasis = "!" * min(retry_count, 3)  # Add up to 3 exclamation marks
            paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure."

            # Extend the original prompt
@@ -392,12 +391,10 @@ class ChatGPTAPI(Base):
        log_path = "log/buglog.txt"

        self.log_retry(state, retry_count, end_time - start_time, log_path)
-        self.log_translation_mismatch(plist_len, result_list, new_str, sep,
-                                      log_path)
+        self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path)

        # Remove paragraph numbers from the result
-        result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in
-                       result_list]
+        result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list]
        return result_list

    def extract_paragraphs(self, text, paragraph_count):
@@ -405,14 +402,14 @@ class ChatGPTAPI(Base):
        # First try to extract by paragraph numbers (1), (2), etc.
        result_list = []
        for i in range(1, paragraph_count + 1):
-            pattern = rf'\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)'
+            pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)"
            match = re.search(pattern, text, re.DOTALL)
            if match:
                result_list.append(match.group(1).strip())

        # If exact pattern matching failed, try another approach
        if len(result_list) != paragraph_count:
-            pattern = r'\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)'
+            pattern = r"\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)"
            matches = re.findall(pattern, text, re.DOTALL)
            if matches:
                # Sort by paragraph number