From 70a19628041020b0df1e016895d416a2b8a86a0d Mon Sep 17 00:00:00 2001 From: leslie Date: Sat, 19 Apr 2025 22:18:47 +0800 Subject: [PATCH] fix:Fix translation paragraph count mismatch by explicitly instructing LLM about paragraph requirements --- .../translator/chatgptapi_translator.py | 167 ++++++++++++++---- 1 file changed, 129 insertions(+), 38 deletions(-) diff --git a/book_maker/translator/chatgptapi_translator.py b/book_maker/translator/chatgptapi_translator.py index 08fd226..2c946bd 100644 --- a/book_maker/translator/chatgptapi_translator.py +++ b/book_maker/translator/chatgptapi_translator.py @@ -240,29 +240,33 @@ class ChatGPTAPI(Base): ): if len(result_list) == plist_len: return result_list, 0 - best_result_list = result_list retry_count = 0 # Save original prompt template original_prompt_template = self.prompt_template - while retry_count < max_retries and len(result_list) != plist_len: print( f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation", ) - print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...") + print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...") time.sleep(sleep_dur) retry_count += 1 - # Make instructions increasingly explicit with each retry - emphasis = "!" * min(retry_count, 3) # Add up to 3 exclamation marks - paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure." + # Create increasingly strict prompts + structured_prompt = ( + f"CRITICAL!!! Translate the following {plist_len} paragraphs to {{language}}. " + f"Your output MUST have EXACTLY {plist_len} paragraphs - NO MORE, NO LESS. " + f"Each paragraph must be wrapped in numbered XML tags: text, text, etc. " + f"DO NOT skip any paragraph numbers. DO NOT add extra paragraphs. " + f"Required format: translated text\ntranslated text\n...\ntranslated text" + ) - # Extend the original prompt - self.prompt_template = f"{original_prompt_template} {paragraph_instruction}" + self.prompt_template = structured_prompt + " `{text}`" + + translated_text = self.translate(new_str, False) + result_list = self.extract_tagged_paragraphs(translated_text, plist_len) - result_list = self.translate_and_split_lines(new_str) if ( len(result_list) == plist_len or len(best_result_list) < len(result_list) <= plist_len @@ -272,10 +276,18 @@ class ChatGPTAPI(Base): ) ): best_result_list = result_list - # Restore original prompt self.prompt_template = original_prompt_template + # If we still don't have the right number, force it by padding or trimming + if len(best_result_list) != plist_len: + if len(best_result_list) < plist_len: + # Pad with empty strings if we have too few + best_result_list.extend([""] * (plist_len - len(best_result_list))) + else: + # Trim if we have too many + best_result_list = best_result_list[:plist_len] + return best_result_list, retry_count def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"): @@ -347,54 +359,133 @@ class ChatGPTAPI(Base): return new_text def translate_list(self, plist): - sep = "\n\n\n\n\n" plist_len = len(plist) - new_str = "" - i = 1 - for p in plist: + # Format input with explicit paragraph numbering + formatted_paragraphs = [] + for i, p in enumerate(plist, 1): temp_p = copy(p) for sup in temp_p.find_all("sup"): sup.extract() - new_str += f"({i}) {temp_p.get_text().strip()}{sep}" - i = i + 1 + formatted_paragraphs.append(f"{temp_p.get_text().strip()}") - if new_str.endswith(sep): - new_str = new_str[: -len(sep)] - - new_str = self.join_lines(new_str) + # Join with single newlines for cleaner input + new_str = "\n".join(formatted_paragraphs) print(f"plist len = {plist_len}") - # Preserve original prompt and append paragraph count requirements + # Save original prompt template original_prompt_template = self.prompt_template - self.prompt_template = f"{original_prompt_template} The text contains exactly {plist_len} paragraphs numbered as (1), (2), etc. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering." - # Translate with enhanced prompt - result_list = self.translate_and_split_lines(new_str) + # Create a structured prompt that forces exact paragraph count + structured_prompt = ( + f"Translate the following {plist_len} paragraphs to {{language}}. " + f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. " + f"Each paragraph is wrapped in numbered tags like text. " + f"Preserve these exact tags in your output, only translating the text inside them. " + f"Example output format: translated text for paragraph 1\ntranslated text for paragraph 2\n...\ntranslated text for paragraph {plist_len}" + ) + + self.prompt_template = structured_prompt + " `{text}`" + + # First translation attempt + translated_text = self.translate(new_str, False) + + # Extract paragraphs using the tags + result_list = self.extract_tagged_paragraphs(translated_text, plist_len) + + # If we still don't have the right number, try the retry approach + start_time = time.time() + if len(result_list) != plist_len: + result_list, retry_count = self.get_best_result_list( + plist_len, + new_str, + 6, # WTF this magic number here? + result_list, + ) + else: + retry_count = 0 + + end_time = time.time() # Restore original prompt self.prompt_template = original_prompt_template - start_time = time.time() - - result_list, retry_count = self.get_best_result_list( - plist_len, - new_str, - 6, # WTF this magic number here? - result_list, - ) - - end_time = time.time() - state = "fail" if len(result_list) != plist_len else "success" log_path = "log/buglog.txt" self.log_retry(state, retry_count, end_time - start_time, log_path) - self.log_translation_mismatch(plist_len, result_list, new_str, sep, log_path) + if state == "fail": + self.log_translation_mismatch( + plist_len, result_list, new_str, "\n", log_path + ) + + return result_list + + def extract_tagged_paragraphs(self, text, plist_len): + """Extract paragraphs from text with ... tags.""" + result_list = [] + + # Try extracting with tags first + for i in range(1, plist_len + 1): + pattern = rf"(.*?)" + matches = re.findall(pattern, text, re.DOTALL) + if matches: + result_list.append(matches[0].strip()) + + # If we got all paragraphs, return them + if len(result_list) == plist_len: + return result_list + + # Fallback: try general tag pattern + pattern = r"(.*?)" + matches = re.findall(pattern, text, re.DOTALL) + + if matches and len(matches) == plist_len: + # Sort by paragraph number + matches.sort(key=lambda x: int(x[0])) + result_list = [match[1].strip() for match in matches] + return result_list + + # Second fallback: try another approach with numbered paragraphs + result_list = [] + for i in range(1, plist_len + 1): + pattern = rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)" + match = re.search(pattern, text, re.DOTALL) + if match: + result_list.append(match.group(1).strip()) + + # If all else fails, fall back to splitting by lines + if len(result_list) != plist_len: + lines = text.splitlines() + non_empty_lines = [line.strip() for line in lines if line.strip()] + + # Attempt to find paragraph markers and divide accordingly + paragraph_markers = [ + i + for i, line in enumerate(non_empty_lines) + if re.match(r"^\s*(\(\d+\)|\d+\.)", line) + ] + + if len(paragraph_markers) == plist_len: + result_list = [] + for i in range(len(paragraph_markers)): + start = paragraph_markers[i] + end = ( + paragraph_markers[i + 1] + if i < len(paragraph_markers) - 1 + else len(non_empty_lines) + ) + paragraph = " ".join(non_empty_lines[start:end]) + result_list.append(re.sub(r"^\s*(\(\d+\)|\d+\.)\s*", "", paragraph)) + else: + # Last resort: try to split evenly + result_list = ( + non_empty_lines[:plist_len] + if len(non_empty_lines) >= plist_len + else non_empty_lines + ) - # Remove paragraph numbers from the result - result_list = [re.sub(r"^(\(\d+\)|\d+\.|(\d+))\s*", "", s) for s in result_list] return result_list def extract_paragraphs(self, text, paragraph_count):