From 70a19628041020b0df1e016895d416a2b8a86a0d Mon Sep 17 00:00:00 2001
From: leslie <ly624928214@gmail.com>
Date: Sat, 19 Apr 2025 22:18:47 +0800
Subject: [PATCH] fix: Fix translation paragraph count mismatch by explicitly
 instructing LLM about paragraph requirements

---
 .../translator/chatgptapi_translator.py       | 167 ++++++++++++++----
 1 file changed, 129 insertions(+), 38 deletions(-)
diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py
index 08fd226..2c946bd 100644
--- a/book_maker/translator/chatgptapi_translator.py
+++ b/book_maker/translator/chatgptapi_translator.py
@@ -240,29 +240,33 @@ class ChatGPTAPI(Base):
     ):
         if len(result_list) == plist_len:
             return result_list, 0
-
         best_result_list = result_list
         retry_count = 0
 
         # Save original prompt template
         original_prompt_template = self.prompt_template
-
         while retry_count < max_retries and len(result_list) != plist_len:
             print(
                 f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation",
             )
-            print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
+            print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...")
             time.sleep(sleep_dur)
             retry_count += 1
 
-            # Make instructions increasingly explicit with each retry
-            emphasis = "!" * min(retry_count, 3)  # Add up to 3 exclamation marks
-            paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure."
+            # Create increasingly strict prompts
+            structured_prompt = (
+                f"CRITICAL!!! Translate the following {plist_len} paragraphs to {{language}}. "
+                f"Your output MUST have EXACTLY {plist_len} paragraphs - NO MORE, NO LESS. "
+                f"Each paragraph must be wrapped in numbered XML tags: <p1>text</p1>, <p2>text</p2>, etc. "
+                f"DO NOT skip any paragraph numbers. DO NOT add extra paragraphs. "
+                f"Required format: <p1>translated text</p1>\n<p2>translated text</p2>\n...\n<p{plist_len}>translated text</p{plist_len}>"
+            )
 
-            # Extend the original prompt
-            self.prompt_template = f"{original_prompt_template} {paragraph_instruction}"
+            self.prompt_template = structured_prompt + " `{text}`"
+
+            translated_text = self.translate(new_str, False)
+            result_list = self.extract_tagged_paragraphs(translated_text, plist_len)
 
-            result_list = self.translate_and_split_lines(new_str)
             if (
                 len(result_list) == plist_len
                 or len(best_result_list) < len(result_list) <= plist_len
@@ -272,10 +276,18 @@ class ChatGPTAPI(Base):
                 )
             ):
                 best_result_list = result_list
-
         # Restore original prompt
         self.prompt_template = original_prompt_template
 
+        # If we still don't have the right number, force it by padding or trimming
+        if len(best_result_list) != plist_len:
+            if len(best_result_list) < plist_len:
+                # Pad with empty strings if we have too few
+                best_result_list.extend([""] * (plist_len - len(best_result_list)))
+            else:
+                # Trim if we have too many
+                best_result_list = best_result_list[:plist_len]
+
         return best_result_list, retry_count
 
     def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"):
@@ -347,54 +359,133 @@ class ChatGPTAPI(Base):
         return new_text
 
     def translate_list(self, plist):
-        sep = "\n\n\n\n\n"
+        """Batch-translate the paragraphs in `plist`; return the translated strings."""
         plist_len = len(plist)
 
-        new_str = ""
-        i = 1
-        for p in plist:
+        # Wrap each paragraph's text in numbered <pN>...</pN> tags (<sup> nodes dropped)
+        formatted_paragraphs = []
+        for i, p in enumerate(plist, 1):
             temp_p = copy(p)
             for sup in temp_p.find_all("sup"):
                 sup.extract()
-            new_str += f"({i}) {temp_p.get_text().strip()}{sep}"
-            i = i + 1
+            formatted_paragraphs.append(f"<p{i}>{temp_p.get_text().strip()}</p{i}>")
 
-        if new_str.endswith(sep):
-            new_str = new_str[: -len(sep)]
-
-        new_str = self.join_lines(new_str)
+        # One tagged paragraph per line
+        new_str = "\n".join(formatted_paragraphs)
 
         print(f"plist len = {plist_len}")
 
-        # Preserve original prompt and append paragraph count requirements
+        # Remember the configured prompt template; it is replaced below and restored later
         original_prompt_template = self.prompt_template
-        self.prompt_template = f"{original_prompt_template} The text contains exactly {plist_len} paragraphs numbered as (1), (2), etc. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering."
 
-        # Translate with enhanced prompt
-        result_list = self.translate_and_split_lines(new_str)
+        # Prompt asking for exactly plist_len tagged paragraphs; {language} stays a placeholder
+        structured_prompt = (
+            f"Translate the following {plist_len} paragraphs to {{language}}. "
+            f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. "
+            f"Each paragraph is wrapped in numbered tags like <p1>text</p1>. "
+            f"Preserve these exact tags in your output, only translating the text inside them. "
+            f"Example output format: <p1>translated text for paragraph 1</p1>\n<p2>translated text for paragraph 2</p2>\n...\n<p{plist_len}>translated text for paragraph {plist_len}</p{plist_len}>"
+        )
+        self.prompt_template = structured_prompt + " `{text}`"
+
+        # First translation attempt
+        translated_text = self.translate(new_str, False)
+
+        # Split the response back into paragraphs using the tags
+        result_list = self.extract_tagged_paragraphs(translated_text, plist_len)
+
+        # Wrong paragraph count: retry via get_best_result_list (only this phase is timed)
+        start_time = time.time()
+        if len(result_list) != plist_len:
+            result_list, retry_count = self.get_best_result_list(
+                plist_len,
+                new_str,
+                6,  # NOTE(review): unnamed constant - confirm which parameter this sets
+                result_list,
+            )
+        else:
+            retry_count = 0
+
+        end_time = time.time()
 
         # Restore original prompt
         self.prompt_template = original_prompt_template
 
-        start_time = time.time()
-
-        result_list, retry_count = self.get_best_result_list(
-            plist_len,
-            new_str,
-            6,  # WTF this magic number here?
-            result_list,
-        )
-
-        end_time = time.time()
-
         state = "fail" if len(result_list) != plist_len else "success"
         log_path = "log/buglog.txt"
 
         self.log_retry(state, retry_count, end_time - start_time, log_path)
-        self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path)
+        if state == "fail":  # NOTE(review): retries pad/trim to plist_len; reachable?
+            self.log_translation_mismatch(
+                plist_len, result_list, new_str, "\n", log_path
+            )
+
+        return result_list
+
+    def extract_tagged_paragraphs(self, text, plist_len):
+        """Split a translated response into at most `plist_len` paragraphs.
+
+        Strategies are tried in order until one yields exactly `plist_len`
+        items: exact `<p1>`..`<pN>` tags, any balanced `<pN>` tags sorted by
+        N, inline `(N)` markers, lines starting with `(N)` or `N.`, and
+        finally the first `plist_len` non-empty lines. Stray `<pN>` markup is
+        stripped before the two line-based strategies.
+
+        Args:
+            text: Raw model output to split; None is treated as empty.
+            plist_len: Number of paragraphs that were sent for translation.
+
+        Returns:
+            A list of paragraph strings. It may hold fewer than `plist_len`
+            items when every strategy fails, so callers must check its length.
+        """
+        text = text or ""  # tolerate None/empty model output
+        expected = range(1, plist_len + 1)
+
+        # 1) Exact tags <p1>..<pN>; the first occurrence of each number wins.
+        result_list = []
+        for i in expected:
+            found = re.search(rf"<p{i}>(.*?)</p{i}>", text, re.DOTALL)
+            if found:
+                result_list.append(found.group(1).strip())
+        if len(result_list) == plist_len:
+            return result_list
+
+        # 2) Any balanced <pN> tags, accepted only when the count is right.
+        matches = re.findall(r"<p(\d+)>(.*?)</p\1>", text, re.DOTALL)
+        if len(matches) == plist_len:
+            matches.sort(key=lambda m: int(m[0]))
+            return [body.strip() for _, body in matches]
+
+        # 3) Inline "(N)" markers; each paragraph runs up to the next marker.
+        # A partial hit is discarded below in favour of the line strategies.
+        result_list = []
+        for i in expected:
+            pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)"
+            match = re.search(pattern, text, re.DOTALL)
+            if match:
+                result_list.append(match.group(1).strip())
+
+        if len(result_list) != plist_len:
+            # 4) Line based. Drop stray <pN>/</pN> markup first so that raw
+            # tags never leak into the book when the tagged parse failed.
+            tag = re.compile(r"</?p\d+>")
+            untagged = (tag.sub("", ln) for ln in text.splitlines())
+            lines = [ln.strip() for ln in untagged if ln.strip()]
+            marker = re.compile(r"^\s*(\(\d+\)|\d+\.)\s*")
+            starts = [i for i, ln in enumerate(lines) if marker.match(ln)]
+
+            if len(starts) == plist_len:
+                # Everything from one marker line up to the next is a paragraph.
+                ends = starts[1:] + [len(lines)]
+                result_list = [
+                    marker.sub("", " ".join(lines[a:b]))
+                    for a, b in zip(starts, ends)
+                ]
+            else:
+                # 5) Last resort: the first `plist_len` non-empty lines.
+                result_list = lines[:plist_len]
 
-        # Remove paragraph numbers from the result
-        result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list]
         return result_list
 
     def extract_paragraphs(self, text, paragraph_count):