From 6685b2399336111e710cd45d64a487420bf7b210 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 19:59:32 +0800 Subject: [PATCH 1/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 93 ++++++++++++++----- 1 file changed, 71 insertions(+), 22 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 47fbba6..bfccabe 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -75,7 +75,7 @@ class ChatGPTAPI(Base): api_base=None, prompt_template=None, prompt_sys_msg=None, - temperature=1.0, + temperature=0.2, context_flag=False, context_paragraph_limit=0, **kwargs, @@ -155,6 +155,7 @@ class ChatGPTAPI(Base): model=self.model, messages=messages, temperature=self.temperature, + top_p=0.1 ) return completion @@ -240,28 +241,39 @@ class ChatGPTAPI(Base): ): if len(result_list) == plist_len: return result_list, 0 - best_result_list = result_list retry_count = 0 - + # Save the original templates + original_prompt_template = self.prompt_template + original_system_content = self.system_content while retry_count < max_retries and len(result_list) != plist_len: print( f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation", ) - print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...") + print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...") time.sleep(sleep_dur) retry_count += 1 - result_list = self.translate_and_split_lines(new_str) + + # Use increasingly forceful prompts on retries + self.prompt_template = f"Translate the following text to {{language}}. IMPORTANT: The text has EXACTLY {plist_len} numbered paragraphs. Your translation MUST have EXACTLY {plist_len} paragraphs with the same numbering (1), (2), etc. `{{text}}`" + self.system_content = f"You are a precise translator. The text contains {plist_len} paragraphs. Your output MUST contain exactly {plist_len} paragraphs, no more and no less." + + # Try again with modified instruction + result_str = self.translate(new_str, False) + result_list = self.extract_paragraphs(result_str, plist_len) + if ( len(result_list) == plist_len or len(best_result_list) < len(result_list) <= plist_len or ( - len(result_list) < len(best_result_list) - and len(best_result_list) > plist_len - ) + len(result_list) < len(best_result_list) + and len(best_result_list) > plist_len + ) ): best_result_list = result_list - + # Restore the original templates + self.prompt_template = original_prompt_template + self.system_content = original_system_content return best_result_list, retry_count def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"): @@ -334,8 +346,9 @@ class ChatGPTAPI(Base): def translate_list(self, plist): sep = "\n\n\n\n\n" - # new_str = sep.join([item.text for item in plist]) + plist_len = len(plist) + # Construct the text to be translated new_str = "" i = 1 for p in plist: @@ -347,34 +360,70 @@ class ChatGPTAPI(Base): if new_str.endswith(sep): new_str = new_str[: -len(sep)] - new_str = self.join_lines(new_str) - plist_len = len(plist) + print(f"plist len = {plist_len}") - print(f"plist len = {len(plist)}") + # Save the original prompt template and system message + original_prompt_template = self.prompt_template + original_system_content = self.system_content - result_list = self.translate_and_split_lines(new_str) + # Modify the prompt template and system message to include paragraph count requirement + self.prompt_template = f"Please translate the following {plist_len} numbered paragraphs to {{language}}. Ensure your translation maintains exactly {plist_len} paragraphs and preserves the paragraph numbers. `{{text}}`" + self.system_content = f"You are a translator. The text contains {plist_len} numbered paragraphs. Your translation must have exactly {plist_len} paragraphs with the same numbering structure." + + # Translate with explicit paragraph count instruction + result_str = self.translate(new_str, False) + + # Extract paragraphs with a robust strategy + result_list = self.extract_paragraphs(result_str, plist_len) + + # Restore original templates + self.prompt_template = original_prompt_template + self.system_content = original_system_content start_time = time.time() - result_list, retry_count = self.get_best_result_list( plist_len, new_str, - 6, # WTF this magic number here? + 6, result_list, ) - end_time = time.time() - state = "fail" if len(result_list) != plist_len else "success" log_path = "log/buglog.txt" - self.log_retry(state, retry_count, end_time - start_time, log_path) - self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) + self.log_translation_mismatch(plist_len, result_list, new_str, sep, + log_path) + # Remove paragraph numbers from the result + result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in + result_list] + return result_list + + def extract_paragraphs(self, text, paragraph_count): + """Extract paragraphs from translated text, ensuring paragraph count is preserved.""" + # First try to extract by paragraph numbers (1), (2), etc. + result_list = [] + for i in range(1, paragraph_count + 1): + pattern = rf'\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)' + match = re.search(pattern, text, re.DOTALL) + if match: + result_list.append(match.group(1).strip()) + + # If exact pattern matching failed, try another approach + if len(result_list) != paragraph_count: + pattern = r'\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)' + matches = re.findall(pattern, text, re.DOTALL) + if matches: + # Sort by paragraph number + matches.sort(key=lambda x: int(x[0])) + result_list = [match[1].strip() for match in matches] + + # Fallback to original line-splitting approach + if len(result_list) != paragraph_count: + lines = text.splitlines() + result_list = [line.strip() for line in lines if line.strip() != ""] - # del (num), num. sometime (num) will translated to num. - result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] return result_list def set_deployment_id(self, deployment_id): From b0dbed8826fe01c66c4d80ed2d7ef2a7c4d5a78a Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 20:15:56 +0800 Subject: [PATCH 2/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index bfccabe..b1a6585 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -75,7 +75,7 @@ class ChatGPTAPI(Base): api_base=None, prompt_template=None, prompt_sys_msg=None, - temperature=0.2, + temperature=1.0, context_flag=False, context_paragraph_limit=0, **kwargs, @@ -155,7 +155,6 @@ class ChatGPTAPI(Base): model=self.model, messages=messages, temperature=self.temperature, - top_p=0.1 ) return completion @@ -241,39 +240,43 @@ class ChatGPTAPI(Base): ): if len(result_list) == plist_len: return result_list, 0 + best_result_list = result_list retry_count = 0 - # Save the original templates + + # Save original prompt template original_prompt_template = self.prompt_template - original_system_content = self.system_content + while retry_count < max_retries and len(result_list) != plist_len: print( f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation", ) - print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...") + print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...") time.sleep(sleep_dur) retry_count += 1 - # Use increasingly forceful prompts on retries - self.prompt_template = f"Translate the following text to {{language}}. IMPORTANT: The text has EXACTLY {plist_len} numbered paragraphs. Your translation MUST have EXACTLY {plist_len} paragraphs with the same numbering (1), (2), etc. `{{text}}`" - self.system_content = f"You are a precise translator. The text contains {plist_len} paragraphs. Your output MUST contain exactly {plist_len} paragraphs, no more and no less." + # Make instructions increasingly explicit with each retry + emphasis = "!" * min(retry_count, + 3) # Add up to 3 exclamation marks + paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure." - # Try again with modified instruction - result_str = self.translate(new_str, False) - result_list = self.extract_paragraphs(result_str, plist_len) + # Extend the original prompt + self.prompt_template = f"{original_prompt_template} {paragraph_instruction}" + result_list = self.translate_and_split_lines(new_str) if ( len(result_list) == plist_len or len(best_result_list) < len(result_list) <= plist_len or ( - len(result_list) < len(best_result_list) - and len(best_result_list) > plist_len - ) + len(result_list) < len(best_result_list) + and len(best_result_list) > plist_len + ) ): best_result_list = result_list - # Restore the original templates + + # Restore original prompt self.prompt_template = original_prompt_template - self.system_content = original_system_content + return best_result_list, retry_count def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"): @@ -348,7 +351,6 @@ class ChatGPTAPI(Base): sep = "\n\n\n\n\n" plist_len = len(plist) - # Construct the text to be translated new_str = "" i = 1 for p in plist: @@ -360,41 +362,39 @@ class ChatGPTAPI(Base): if new_str.endswith(sep): new_str = new_str[: -len(sep)] + new_str = self.join_lines(new_str) print(f"plist len = {plist_len}") - # Save the original prompt template and system message + # Preserve original prompt and append paragraph count requirements original_prompt_template = self.prompt_template - original_system_content = self.system_content + self.prompt_template = f"{original_prompt_template} The text contains exactly {plist_len} paragraphs numbered as (1), (2), etc. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering." - # Modify the prompt template and system message to include paragraph count requirement - self.prompt_template = f"Please translate the following {plist_len} numbered paragraphs to {{language}}. Ensure your translation maintains exactly {plist_len} paragraphs and preserves the paragraph numbers. `{{text}}`" - self.system_content = f"You are a translator. The text contains {plist_len} numbered paragraphs. Your translation must have exactly {plist_len} paragraphs with the same numbering structure." + # Translate with enhanced prompt + result_list = self.translate_and_split_lines(new_str) - # Translate with explicit paragraph count instruction - result_str = self.translate(new_str, False) - - # Extract paragraphs with a robust strategy - result_list = self.extract_paragraphs(result_str, plist_len) - - # Restore original templates + # Restore original prompt self.prompt_template = original_prompt_template - self.system_content = original_system_content start_time = time.time() + result_list, retry_count = self.get_best_result_list( plist_len, new_str, - 6, + 6, # WTF this magic number here? result_list, ) + end_time = time.time() + state = "fail" if len(result_list) != plist_len else "success" log_path = "log/buglog.txt" + self.log_retry(state, retry_count, end_time - start_time, log_path) self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) + # Remove paragraph numbers from the result result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] From 83303d1dd844a5997f90bc117db129eeb2312a75 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 20:31:22 +0800 Subject: [PATCH 3/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- book_maker/translator/chatgptapi_translator.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index b1a6585..08fd226 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -256,8 +256,7 @@ class ChatGPTAPI(Base): retry_count += 1 # Make instructions increasingly explicit with each retry - emphasis = "!" * min(retry_count, - 3) # Add up to 3 exclamation marks + emphasis = "!" * min(retry_count, 3) # Add up to 3 exclamation marks paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure." # Extend the original prompt @@ -392,12 +391,10 @@ class ChatGPTAPI(Base): log_path = "log/buglog.txt" self.log_retry(state, retry_count, end_time - start_time, log_path) - self.log_translation_mismatch(plist_len, result_list, new_str, sep, - log_path) + self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) # Remove paragraph numbers from the result - result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in - result_list] + result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] return result_list def extract_paragraphs(self, text, paragraph_count): @@ -405,14 +402,14 @@ class ChatGPTAPI(Base): # First try to extract by paragraph numbers (1), (2), etc. result_list = [] for i in range(1, paragraph_count + 1): - pattern = rf'\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)' + pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)" match = re.search(pattern, text, re.DOTALL) if match: result_list.append(match.group(1).strip()) # If exact pattern matching failed, try another approach if len(result_list) != paragraph_count: - pattern = r'\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)' + pattern = r"\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)" matches = re.findall(pattern, text, re.DOTALL) if matches: # Sort by paragraph number From 70a19628041020b0df1e016895d416a2b8a86a0d Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 22:18:47 +0800 Subject: [PATCH 4/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 167 ++++++++++++++---- 1 file changed, 129 insertions(+), 38 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 08fd226..2c946bd 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -240,29 +240,33 @@ class ChatGPTAPI(Base): ): if len(result_list) == plist_len: return result_list, 0 - best_result_list = result_list retry_count = 0 # Save original prompt template original_prompt_template = self.prompt_template - while retry_count < max_retries and len(result_list) != plist_len: print( f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation", ) - print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...") + print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...") time.sleep(sleep_dur) retry_count += 1 - # Make instructions increasingly explicit with each retry - emphasis = "!" * min(retry_count, 3) # Add up to 3 exclamation marks - paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure." + # Create increasingly strict prompts + structured_prompt = ( + f"CRITICAL!!! Translate the following {plist_len} paragraphs to {{language}}. " + f"Your output MUST have EXACTLY {plist_len} paragraphs - NO MORE, NO LESS. " + f"Each paragraph must be wrapped in numbered XML tags: text, text, etc. " + f"DO NOT skip any paragraph numbers. DO NOT add extra paragraphs. " + f"Required format: translated text\ntranslated text\n...\ntranslated text" + ) - # Extend the original prompt - self.prompt_template = f"{original_prompt_template} {paragraph_instruction}" + self.prompt_template = structured_prompt + " `{text}`" + + translated_text = self.translate(new_str, False) + result_list = self.extract_tagged_paragraphs(translated_text, plist_len) - result_list = self.translate_and_split_lines(new_str) if ( len(result_list) == plist_len or len(best_result_list) < len(result_list) <= plist_len @@ -272,10 +276,18 @@ class ChatGPTAPI(Base): ) ): best_result_list = result_list - # Restore original prompt self.prompt_template = original_prompt_template + # If we still don't have the right number, force it by padding or trimming + if len(best_result_list) != plist_len: + if len(best_result_list) < plist_len: + # Pad with empty strings if we have too few + best_result_list.extend([""] * (plist_len - len(best_result_list))) + else: + # Trim if we have too many + best_result_list = best_result_list[:plist_len] + return best_result_list, retry_count def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"): @@ -347,54 +359,133 @@ class ChatGPTAPI(Base): return new_text def translate_list(self, plist): - sep = "\n\n\n\n\n" plist_len = len(plist) - new_str = "" - i = 1 - for p in plist: + # Format input with explicit paragraph numbering + formatted_paragraphs = [] + for i, p in enumerate(plist, 1): temp_p = copy(p) for sup in temp_p.find_all("sup"): sup.extract() - new_str += f"({i}) {temp_p.get_text().strip()}{sep}" - i = i + 1 + formatted_paragraphs.append(f"{temp_p.get_text().strip()}") - if new_str.endswith(sep): - new_str = new_str[: -len(sep)] - - new_str = self.join_lines(new_str) + # Join with single newlines for cleaner input + new_str = "\n".join(formatted_paragraphs) print(f"plist len = {plist_len}") - # Preserve original prompt and append paragraph count requirements + # Save original prompt template original_prompt_template = self.prompt_template - self.prompt_template = f"{original_prompt_template} The text contains exactly {plist_len} paragraphs numbered as (1), (2), etc. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering." - # Translate with enhanced prompt - result_list = self.translate_and_split_lines(new_str) + # Create a structured prompt that forces exact paragraph count + structured_prompt = ( + f"Translate the following {plist_len} paragraphs to {{language}}. " + f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. " + f"Each paragraph is wrapped in numbered tags like text. " + f"Preserve these exact tags in your output, only translating the text inside them. " + f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" + ) + + self.prompt_template = structured_prompt + " `{text}`" + + # First translation attempt + translated_text = self.translate(new_str, False) + + # Extract paragraphs using the tags + result_list = self.extract_tagged_paragraphs(translated_text, plist_len) + + # If we still don't have the right number, try the retry approach + start_time = time.time() + if len(result_list) != plist_len: + result_list, retry_count = self.get_best_result_list( + plist_len, + new_str, + 6, # WTF this magic number here? + result_list, + ) + else: + retry_count = 0 + + end_time = time.time() # Restore original prompt self.prompt_template = original_prompt_template - start_time = time.time() - - result_list, retry_count = self.get_best_result_list( - plist_len, - new_str, - 6, # WTF this magic number here? - result_list, - ) - - end_time = time.time() - state = "fail" if len(result_list) != plist_len else "success" log_path = "log/buglog.txt" self.log_retry(state, retry_count, end_time - start_time, log_path) - self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) + if state == "fail": + self.log_translation_mismatch( + plist_len, result_list, new_str, "\n", log_path + ) + + return result_list + + def extract_tagged_paragraphs(self, text, plist_len): + """Extract paragraphs from text with ... tags.""" + result_list = [] + + # Try extracting with tags first + for i in range(1, plist_len + 1): + pattern = rf"(.*?)" + matches = re.findall(pattern, text, re.DOTALL) + if matches: + result_list.append(matches[0].strip()) + + # If we got all paragraphs, return them + if len(result_list) == plist_len: + return result_list + + # Fallback: try general tag pattern + pattern = r"(.*?)" + matches = re.findall(pattern, text, re.DOTALL) + + if matches and len(matches) == plist_len: + # Sort by paragraph number + matches.sort(key=lambda x: int(x[0])) + result_list = [match[1].strip() for match in matches] + return result_list + + # Second fallback: try another approach with numbered paragraphs + result_list = [] + for i in range(1, plist_len + 1): + pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)" + match = re.search(pattern, text, re.DOTALL) + if match: + result_list.append(match.group(1).strip()) + + # If all else fails, fall back to splitting by lines + if len(result_list) != plist_len: + lines = text.splitlines() + non_empty_lines = [line.strip() for line in lines if line.strip()] + + # Attempt to find paragraph markers and divide accordingly + paragraph_markers = [ + i + for i, line in enumerate(non_empty_lines) + if re.match(r"^\s*(\(\d+\)|\d+\.)", line) + ] + + if len(paragraph_markers) == plist_len: + result_list = [] + for i in range(len(paragraph_markers)): + start = paragraph_markers[i] + end = ( + paragraph_markers[i + 1] + if i < len(paragraph_markers) - 1 + else len(non_empty_lines) + ) + paragraph = " ".join(non_empty_lines[start:end]) + result_list.append(re.sub(r"^\s*(\(\d+\)|\d+\.)\s*", "", paragraph)) + else: + # Last resort: try to split evenly + result_list = ( + non_empty_lines[:plist_len] + if len(non_empty_lines) >= plist_len + else non_empty_lines + ) - # Remove paragraph numbers from the result - result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] return result_list def extract_paragraphs(self, text, paragraph_count): From 750ecd7d9358a6e50bcf0e182ac8e1263bf8f029 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 22:57:47 +0800 Subject: [PATCH 5/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 2c946bd..499aa63 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -377,12 +377,14 @@ class ChatGPTAPI(Base): # Save original prompt template original_prompt_template = self.prompt_template - # Create a structured prompt that forces exact paragraph count + # Create a structured prompt that forces exact paragraph count and prevents merging structured_prompt = ( f"Translate the following {plist_len} paragraphs to {{language}}. " f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. " f"Each paragraph is wrapped in numbered tags like text. " - f"Preserve these exact tags in your output, only translating the text inside them. " + f"DO NOT merge paragraphs. Keep each paragraph separate. " + f"DO NOT combine multiple paragraphs into one. " + f"Each original paragraph should become exactly one translated paragraph. " f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" ) @@ -400,7 +402,7 @@ class ChatGPTAPI(Base): result_list, retry_count = self.get_best_result_list( plist_len, new_str, - 6, # WTF this magic number here? + 6, result_list, ) else: @@ -411,6 +413,38 @@ class ChatGPTAPI(Base): # Restore original prompt self.prompt_template = original_prompt_template + # Clean up the results - strip any XML tags from the final output + cleaned_result_list = [] + for paragraph in result_list: + # Remove any XML tags that might be in the output + cleaned_text = re.sub(r"(.*?)", r"\1", paragraph) + # Also clean any partial tags + cleaned_text = re.sub(r"", "", cleaned_text).strip() + cleaned_result_list.append(cleaned_text) + + # Check for merged paragraphs and attempt to split them + final_result_list = [] + for paragraph in cleaned_result_list: + # If this is potentially a merged paragraph, try to split it + if len(paragraph) > 200 and ". " in paragraph: + # Look for sentence patterns that might indicate paragraph breaks + potential_paragraphs = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", paragraph) + # Only split if it would help us get closer to the target paragraph count + if ( + len(potential_paragraphs) > 1 + and len(final_result_list) + len(potential_paragraphs) <= plist_len + ): + final_result_list.extend(potential_paragraphs) + continue + final_result_list.append(paragraph) + + # Ensure we have plist_len paragraphs + if len(final_result_list) > plist_len: + final_result_list = final_result_list[:plist_len] + elif len(final_result_list) < plist_len: + final_result_list.extend([""] * (plist_len - len(final_result_list))) + + # Log results state = "fail" if len(result_list) != plist_len else "success" log_path = "log/buglog.txt" @@ -420,7 +454,12 @@ class ChatGPTAPI(Base): plist_len, result_list, new_str, "\n", log_path ) - return result_list + # Del paragraph numbers if any remain + final_result_list = [ + re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in final_result_list + ] + + return final_result_list def extract_tagged_paragraphs(self, text, plist_len): """Extract paragraphs from text with ... tags.""" From 09589c626da0d185da1111d3cf0715257af1a7b3 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 23:28:02 +0800 Subject: [PATCH 6/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- book_maker/translator/chatgptapi_translator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 499aa63..02332ce 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -384,6 +384,7 @@ class ChatGPTAPI(Base): f"Each paragraph is wrapped in numbered tags like text. " f"DO NOT merge paragraphs. Keep each paragraph separate. " f"DO NOT combine multiple paragraphs into one. " + f"Preserve numbers at the beginning of paragraphs like '17' or '10x'. " f"Each original paragraph should become exactly one translated paragraph. " f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" ) @@ -438,7 +439,7 @@ class ChatGPTAPI(Base): continue final_result_list.append(paragraph) - # Ensure we have plist_len paragraphs + # Ensure we have exactly plist_len paragraphs if len(final_result_list) > plist_len: final_result_list = final_result_list[:plist_len] elif len(final_result_list) < plist_len: @@ -454,9 +455,9 @@ class ChatGPTAPI(Base): plist_len, result_list, new_str, "\n", log_path ) - # Del paragraph numbers if any remain + # Remove ONLY the paragraph numbering formats, not all numbers at the start final_result_list = [ - re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in final_result_list + re.sub(r"^(\(\d+\)|\d+\.)\s*", "", s) for s in final_result_list ] return final_result_list From 57ca4da847abbddb772c9889c1506012226a22c0 Mon Sep 17 00:00:00 2001 From: leslie Date: Sun, 20 Apr 2025 16:48:11 +0800 Subject: [PATCH 7/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 158 +++++++++--------- 1 file changed, 81 insertions(+), 77 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 02332ce..67c6239 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -361,106 +361,110 @@ class ChatGPTAPI(Base): def translate_list(self, plist): plist_len = len(plist) - # Format input with explicit paragraph numbering - formatted_paragraphs = [] + # 创建原始文本列表,并为每个段落添加明确的编号标记 + formatted_text = "" for i, p in enumerate(plist, 1): temp_p = copy(p) for sup in temp_p.find_all("sup"): sup.extract() - formatted_paragraphs.append(f"{temp_p.get_text().strip()}") - - # Join with single newlines for cleaner input - new_str = "\n".join(formatted_paragraphs) + para_text = temp_p.get_text().strip() + # 使用特殊的分隔符和明确的编号 + formatted_text += f"PARAGRAPH {i}:\n{para_text}\n\n" print(f"plist len = {plist_len}") - # Save original prompt template + # 保存原始提示模板 original_prompt_template = self.prompt_template - # Create a structured prompt that forces exact paragraph count and prevents merging + # 创建明确要求保持段落结构的提示 structured_prompt = ( f"Translate the following {plist_len} paragraphs to {{language}}. " - f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. " - f"Each paragraph is wrapped in numbered tags like text. " - f"DO NOT merge paragraphs. Keep each paragraph separate. " - f"DO NOT combine multiple paragraphs into one. " - f"Preserve numbers at the beginning of paragraphs like '17' or '10x'. " - f"Each original paragraph should become exactly one translated paragraph. " - f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" + f"CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n" + f"TRANSLATION OF PARAGRAPH 1:\n[Your translation of paragraph 1 here]\n\n" + f"TRANSLATION OF PARAGRAPH 2:\n[Your translation of paragraph 2 here]\n\n" + f"... and so on for all {plist_len} paragraphs.\n\n" + f"You MUST provide EXACTLY {plist_len} translated paragraphs. " + f"Do not merge, split, or rearrange paragraphs. " + f"Translate each paragraph independently but consistently. " + f"Keep all numbers and special formatting in your translation. " + f"Each original paragraph must correspond to exactly one translated paragraph." ) - self.prompt_template = structured_prompt + " `{text}`" + self.prompt_template = structured_prompt + " ```{text}```" - # First translation attempt - translated_text = self.translate(new_str, False) + # 翻译 + translated_text = self.translate(formatted_text, False) - # Extract paragraphs using the tags - result_list = self.extract_tagged_paragraphs(translated_text, plist_len) - - # If we still don't have the right number, try the retry approach - start_time = time.time() - if len(result_list) != plist_len: - result_list, retry_count = self.get_best_result_list( - plist_len, - new_str, - 6, - result_list, + # 从结构化输出中提取翻译 + translated_paragraphs = [] + for i in range(1, plist_len + 1): + pattern = ( + r"TRANSLATION OF PARAGRAPH " + + str(i) + + r":(.*?)(?=TRANSLATION OF PARAGRAPH \d+:|\Z)" ) - else: - retry_count = 0 + matches = re.findall(pattern, translated_text, re.DOTALL) - end_time = time.time() + if matches: + translated_paragraph = matches[0].strip() + translated_paragraphs.append(translated_paragraph) + else: + print(f"Warning: Could not find translation for paragraph {i}") + # 尝试更宽松的匹配 + loose_pattern = ( + r"(?:TRANSLATION|PARAGRAPH|PARA).*?" + + str(i) + + r".*?:(.*?)(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)" + ) + loose_matches = re.findall(loose_pattern, translated_text, re.DOTALL) + if loose_matches: + translated_paragraphs.append(loose_matches[0].strip()) + else: + translated_paragraphs.append("") - # Restore original prompt + # 恢复原始提示 self.prompt_template = original_prompt_template - # Clean up the results - strip any XML tags from the final output - cleaned_result_list = [] - for paragraph in result_list: - # Remove any XML tags that might be in the output - cleaned_text = re.sub(r"(.*?)", r"\1", paragraph) - # Also clean any partial tags - cleaned_text = re.sub(r"", "", cleaned_text).strip() - cleaned_result_list.append(cleaned_text) - - # Check for merged paragraphs and attempt to split them - final_result_list = [] - for paragraph in cleaned_result_list: - # If this is potentially a merged paragraph, try to split it - if len(paragraph) > 200 and ". " in paragraph: - # Look for sentence patterns that might indicate paragraph breaks - potential_paragraphs = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", paragraph) - # Only split if it would help us get closer to the target paragraph count - if ( - len(potential_paragraphs) > 1 - and len(final_result_list) + len(potential_paragraphs) <= plist_len - ): - final_result_list.extend(potential_paragraphs) - continue - final_result_list.append(paragraph) - - # Ensure we have exactly plist_len paragraphs - if len(final_result_list) > plist_len: - final_result_list = final_result_list[:plist_len] - elif len(final_result_list) < plist_len: - final_result_list.extend([""] * (plist_len - len(final_result_list))) - - # Log results - state = "fail" if len(result_list) != plist_len else "success" - log_path = "log/buglog.txt" - - self.log_retry(state, retry_count, end_time - start_time, log_path) - if state == "fail": - self.log_translation_mismatch( - plist_len, result_list, new_str, "\n", log_path + # 如果提取到的段落数不正确,尝试备用提取方法 + if len(translated_paragraphs) != plist_len: + print( + f"Warning: Extracted {len(translated_paragraphs)}/{plist_len} paragraphs. Using fallback extraction." ) + # 提取所有可能的段落标记 + all_para_pattern = r"(?:TRANSLATION|PARAGRAPH|PARA).*?(\d+).*?:(.*?)(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)" + all_matches = re.findall(all_para_pattern, translated_text, re.DOTALL) - # Remove ONLY the paragraph numbering formats, not all numbers at the start - final_result_list = [ - re.sub(r"^(\(\d+\)|\d+\.)\s*", "", s) for s in final_result_list - ] + if all_matches: + # 创建一个字典,根据段落编号映射翻译内容 + para_dict = {} + for num_str, content in all_matches: + try: + num = int(num_str) + if 1 <= num <= plist_len: + para_dict[num] = content.strip() + except ValueError: + continue - return final_result_list + # 按原始顺序重建翻译列表 + new_translated_paragraphs = [] + for i in range(1, plist_len + 1): + if i in para_dict: + new_translated_paragraphs.append(para_dict[i]) + else: + new_translated_paragraphs.append("") + + if len(new_translated_paragraphs) == plist_len: + translated_paragraphs = new_translated_paragraphs + + # 确保最终有正确数量的段落 + if len(translated_paragraphs) < plist_len: + translated_paragraphs.extend( + [""] * (plist_len - len(translated_paragraphs)) + ) + elif len(translated_paragraphs) > plist_len: + translated_paragraphs = translated_paragraphs[:plist_len] + + return translated_paragraphs def extract_tagged_paragraphs(self, text, plist_len): """Extract paragraphs from text with ... tags.""" From cc4f4c4daed9fcf5dac9d9b9f729d406194771c3 Mon Sep 17 00:00:00 2001 From: leslie Date: Mon, 21 Apr 2025 15:05:21 +0800 Subject: [PATCH 8/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 128 +----------------- 1 file changed, 1 insertion(+), 127 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 67c6239..6d11437 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -75,7 +75,7 @@ class ChatGPTAPI(Base): api_base=None, prompt_template=None, prompt_sys_msg=None, - temperature=1.0, + temperature=1.3, context_flag=False, context_paragraph_limit=0, **kwargs, @@ -230,66 +230,6 @@ class ChatGPTAPI(Base): lines = [line.strip() for line in lines if line.strip() != ""] return lines - def get_best_result_list( - self, - plist_len, - new_str, - sleep_dur, - result_list, - max_retries=15, - ): - if len(result_list) == plist_len: - return result_list, 0 - best_result_list = result_list - retry_count = 0 - - # Save original prompt template - original_prompt_template = self.prompt_template - while retry_count < max_retries and len(result_list) != plist_len: - print( - f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation", - ) - print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...") - time.sleep(sleep_dur) - retry_count += 1 - - # Create increasingly strict prompts - structured_prompt = ( - f"CRITICAL!!! Translate the following {plist_len} paragraphs to {{language}}. " - f"Your output MUST have EXACTLY {plist_len} paragraphs - NO MORE, NO LESS. " - f"Each paragraph must be wrapped in numbered XML tags: text, text, etc. " - f"DO NOT skip any paragraph numbers. DO NOT add extra paragraphs. " - f"Required format: translated text\ntranslated text\n...\ntranslated text" - ) - - self.prompt_template = structured_prompt + " `{text}`" - - translated_text = self.translate(new_str, False) - result_list = self.extract_tagged_paragraphs(translated_text, plist_len) - - if ( - len(result_list) == plist_len - or len(best_result_list) < len(result_list) <= plist_len - or ( - len(result_list) < len(best_result_list) - and len(best_result_list) > plist_len - ) - ): - best_result_list = result_list - # Restore original prompt - self.prompt_template = original_prompt_template - - # If we still don't have the right number, force it by padding or trimming - if len(best_result_list) != plist_len: - if len(best_result_list) < plist_len: - # Pad with empty strings if we have too few - best_result_list.extend([""] * (plist_len - len(best_result_list))) - else: - # Trim if we have too many - best_result_list = best_result_list[:plist_len] - - return best_result_list, retry_count - def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"): if retry_count == 0: return @@ -466,72 +406,6 @@ class ChatGPTAPI(Base): return translated_paragraphs - def extract_tagged_paragraphs(self, text, plist_len): - """Extract paragraphs from text with ... tags.""" - result_list = [] - - # Try extracting with tags first - for i in range(1, plist_len + 1): - pattern = rf"(.*?)" - matches = re.findall(pattern, text, re.DOTALL) - if matches: - result_list.append(matches[0].strip()) - - # If we got all paragraphs, return them - if len(result_list) == plist_len: - return result_list - - # Fallback: try general tag pattern - pattern = r"(.*?)" - matches = re.findall(pattern, text, re.DOTALL) - - if matches and len(matches) == plist_len: - # Sort by paragraph number - matches.sort(key=lambda x: int(x[0])) - result_list = [match[1].strip() for match in matches] - return result_list - - # Second fallback: try another approach with numbered paragraphs - result_list = [] - for i in range(1, plist_len + 1): - pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)" - match = re.search(pattern, text, re.DOTALL) - if match: - result_list.append(match.group(1).strip()) - - # If all else fails, fall back to splitting by lines - if len(result_list) != plist_len: - lines = text.splitlines() - non_empty_lines = [line.strip() for line in lines if line.strip()] - - # Attempt to find paragraph markers and divide accordingly - paragraph_markers = [ - i - for i, line in enumerate(non_empty_lines) - if re.match(r"^\s*(\(\d+\)|\d+\.)", line) - ] - - if len(paragraph_markers) == plist_len: - result_list = [] - for i in range(len(paragraph_markers)): - start = paragraph_markers[i] - end = ( - paragraph_markers[i + 1] - if i < len(paragraph_markers) - 1 - else len(non_empty_lines) - ) - paragraph = " ".join(non_empty_lines[start:end]) - result_list.append(re.sub(r"^\s*(\(\d+\)|\d+\.)\s*", "", paragraph)) - else: - # Last resort: try to split evenly - result_list = ( - non_empty_lines[:plist_len] - if len(non_empty_lines) >= plist_len - else non_empty_lines - ) - - return result_list - def extract_paragraphs(self, text, paragraph_count): """Extract paragraphs from translated text, ensuring paragraph count is preserved.""" # First try to extract by paragraph numbers (1), (2), etc. From c780f7c516c0d79c8da467f103f40dfb1b0a41bd Mon Sep 17 00:00:00 2001 From: leslie Date: Mon, 21 Apr 2025 16:11:33 +0800 Subject: [PATCH 9/9] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 6d11437..2dc9f14 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -75,7 +75,7 @@ class ChatGPTAPI(Base): api_base=None, prompt_template=None, prompt_sys_msg=None, - temperature=1.3, + temperature=1.0, context_flag=False, context_paragraph_limit=0, **kwargs, @@ -301,22 +301,20 @@ class ChatGPTAPI(Base): def translate_list(self, plist): plist_len = len(plist) - # 创建原始文本列表,并为每个段落添加明确的编号标记 + # Create a list of original texts and add clear numbering markers to each paragraph formatted_text = "" for i, p in enumerate(plist, 1): temp_p = copy(p) for sup in temp_p.find_all("sup"): sup.extract() para_text = temp_p.get_text().strip() - # 使用特殊的分隔符和明确的编号 + # Using special delimiters and clear numbering formatted_text += f"PARAGRAPH {i}:\n{para_text}\n\n" print(f"plist len = {plist_len}") - # 保存原始提示模板 original_prompt_template = self.prompt_template - # 创建明确要求保持段落结构的提示 structured_prompt = ( f"Translate the following {plist_len} paragraphs to {{language}}. " f"CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n" @@ -332,10 +330,9 @@ class ChatGPTAPI(Base): self.prompt_template = structured_prompt + " ```{text}```" - # 翻译 translated_text = self.translate(formatted_text, False) - # 从结构化输出中提取翻译 + # Extract translations from structured output translated_paragraphs = [] for i in range(1, plist_len + 1): pattern = ( @@ -350,7 +347,6 @@ class ChatGPTAPI(Base): translated_paragraphs.append(translated_paragraph) else: print(f"Warning: Could not find translation for paragraph {i}") - # 尝试更宽松的匹配 loose_pattern = ( r"(?:TRANSLATION|PARAGRAPH|PARA).*?" + str(i) @@ -362,20 +358,19 @@ class ChatGPTAPI(Base): else: translated_paragraphs.append("") - # 恢复原始提示 self.prompt_template = original_prompt_template - # 如果提取到的段落数不正确,尝试备用提取方法 + # If the number of extracted paragraphs is incorrect, try the alternative extraction method. if len(translated_paragraphs) != plist_len: print( f"Warning: Extracted {len(translated_paragraphs)}/{plist_len} paragraphs. Using fallback extraction." ) - # 提取所有可能的段落标记 + all_para_pattern = r"(?:TRANSLATION|PARAGRAPH|PARA).*?(\d+).*?:(.*?)(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)" all_matches = re.findall(all_para_pattern, translated_text, re.DOTALL) if all_matches: - # 创建一个字典,根据段落编号映射翻译内容 + # Create a dictionary to map translation content based on paragraph numbers para_dict = {} for num_str, content in all_matches: try: @@ -385,7 +380,7 @@ class ChatGPTAPI(Base): except ValueError: continue - # 按原始顺序重建翻译列表 + # Rebuild the translation list in the original order new_translated_paragraphs = [] for i in range(1, plist_len + 1): if i in para_dict: @@ -396,7 +391,6 @@ class ChatGPTAPI(Base): if len(new_translated_paragraphs) == plist_len: translated_paragraphs = new_translated_paragraphs - # 确保最终有正确数量的段落 if len(translated_paragraphs) < plist_len: translated_paragraphs.extend( [""] * (plist_len - len(translated_paragraphs))