From 83303d1dd844a5997f90bc117db129eeb2312a75 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 20:31:22 +0800 Subject: [PATCH] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- book_maker/translator/chatgptapi_translator.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index b1a6585..08fd226 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -256,8 +256,7 @@ class ChatGPTAPI(Base): retry_count += 1 # Make instructions increasingly explicit with each retry - emphasis = "!" * min(retry_count, - 3) # Add up to 3 exclamation marks + emphasis = "!" * min(retry_count, 3) # Add up to 3 exclamation marks paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure." # Extend the original prompt @@ -392,12 +391,10 @@ class ChatGPTAPI(Base): log_path = "log/buglog.txt" self.log_retry(state, retry_count, end_time - start_time, log_path) - self.log_translation_mismatch(plist_len, result_list, new_str, sep, - log_path) + self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) # Remove paragraph numbers from the result - result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in - result_list] + result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] return result_list def extract_paragraphs(self, text, paragraph_count): @@ -405,14 +402,14 @@ class ChatGPTAPI(Base): # First try to extract by paragraph numbers (1), (2), etc. result_list = [] for i in range(1, paragraph_count + 1): - pattern = rf'\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)' + pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)" match = re.search(pattern, text, re.DOTALL) if match: result_list.append(match.group(1).strip()) # If exact pattern matching failed, try another approach if len(result_list) != paragraph_count: - pattern = r'\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)' + pattern = r"\((\d+)\)\s*(.*?)(?=\s*\(\d+\)|\Z)" matches = re.findall(pattern, text, re.DOTALL) if matches: # Sort by paragraph number