From 09589c626da0d185da1111d3cf0715257af1a7b3 Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 23:28:02 +0800 Subject: [PATCH] fix: Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- book_maker/translator/chatgptapi_translator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 499aa63..02332ce 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -384,6 +384,7 @@ class ChatGPTAPI(Base): f"Each paragraph is wrapped in numbered tags like text. " f"DO NOT merge paragraphs. Keep each paragraph separate. " f"DO NOT combine multiple paragraphs into one. " + f"Preserve numbers at the beginning of paragraphs like '17' or '10x'. " f"Each original paragraph should become exactly one translated paragraph. " f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" ) @@ -438,7 +439,7 @@ class ChatGPTAPI(Base): continue final_result_list.append(paragraph) - # Ensure we have plist_len paragraphs + # Ensure we have exactly plist_len paragraphs if len(final_result_list) > plist_len: final_result_list = final_result_list[:plist_len] elif len(final_result_list) < plist_len: @@ -454,9 +455,9 @@ class ChatGPTAPI(Base): plist_len, result_list, new_str, "\n", log_path ) - # Del paragraph numbers if any remain + # Remove ONLY the paragraph numbering formats, not all numbers at the start final_result_list = [ - re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in final_result_list + re.sub(r"^(\(\d+\)|\d+\.)\s*", "", s) for s in final_result_list ] return final_result_list