mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-07-19 08:30:16 +00:00
fix: resolve translation paragraph-count mismatch by explicitly instructing the LLM about paragraph requirements
This commit is contained in:
parent
83303d1dd8
commit
70a1962804
@ -240,29 +240,33 @@ class ChatGPTAPI(Base):
|
||||
):
|
||||
if len(result_list) == plist_len:
|
||||
return result_list, 0
|
||||
|
||||
best_result_list = result_list
|
||||
retry_count = 0
|
||||
|
||||
# Save original prompt template
|
||||
original_prompt_template = self.prompt_template
|
||||
|
||||
while retry_count < max_retries and len(result_list) != plist_len:
|
||||
print(
|
||||
f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation",
|
||||
)
|
||||
print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
|
||||
print(f"sleep for {sleep_dur}s and retry {retry_count + 1} ...")
|
||||
time.sleep(sleep_dur)
|
||||
retry_count += 1
|
||||
|
||||
# Make instructions increasingly explicit with each retry
|
||||
emphasis = "!" * min(retry_count, 3) # Add up to 3 exclamation marks
|
||||
paragraph_instruction = f"IMPORTANT{emphasis} The text contains exactly {plist_len} numbered paragraphs. Your translation MUST maintain exactly {plist_len} paragraphs with the same numbering structure."
|
||||
# Create increasingly strict prompts
|
||||
structured_prompt = (
|
||||
f"CRITICAL!!! Translate the following {plist_len} paragraphs to {{language}}. "
|
||||
f"Your output MUST have EXACTLY {plist_len} paragraphs - NO MORE, NO LESS. "
|
||||
f"Each paragraph must be wrapped in numbered XML tags: <p1>text</p1>, <p2>text</p2>, etc. "
|
||||
f"DO NOT skip any paragraph numbers. DO NOT add extra paragraphs. "
|
||||
f"Required format: <p1>translated text</p1>\n<p2>translated text</p2>\n...\n<p{plist_len}>translated text</p{plist_len}>"
|
||||
)
|
||||
|
||||
# Extend the original prompt
|
||||
self.prompt_template = f"{original_prompt_template} {paragraph_instruction}"
|
||||
self.prompt_template = structured_prompt + " `{text}`"
|
||||
|
||||
translated_text = self.translate(new_str, False)
|
||||
result_list = self.extract_tagged_paragraphs(translated_text, plist_len)
|
||||
|
||||
result_list = self.translate_and_split_lines(new_str)
|
||||
if (
|
||||
len(result_list) == plist_len
|
||||
or len(best_result_list) < len(result_list) <= plist_len
|
||||
@ -272,10 +276,18 @@ class ChatGPTAPI(Base):
|
||||
)
|
||||
):
|
||||
best_result_list = result_list
|
||||
|
||||
# Restore original prompt
|
||||
self.prompt_template = original_prompt_template
|
||||
|
||||
# If we still don't have the right number, force it by padding or trimming
|
||||
if len(best_result_list) != plist_len:
|
||||
if len(best_result_list) < plist_len:
|
||||
# Pad with empty strings if we have too few
|
||||
best_result_list.extend([""] * (plist_len - len(best_result_list)))
|
||||
else:
|
||||
# Trim if we have too many
|
||||
best_result_list = best_result_list[:plist_len]
|
||||
|
||||
return best_result_list, retry_count
|
||||
|
||||
def log_retry(self, state, retry_count, elapsed_time, log_path="log/buglog.txt"):
|
||||
@ -347,54 +359,133 @@ class ChatGPTAPI(Base):
|
||||
return new_text
|
||||
|
||||
def translate_list(self, plist):
    """Translate a batch of paragraphs in one request, keeping their count.

    Each paragraph's text (with ``<sup>`` elements removed from a copy, so
    the caller's nodes are untouched) is wrapped in numbered ``<pN>`` tags.
    The batch is sent with a prompt that demands the same tags back, and
    the reply is split with ``extract_tagged_paragraphs``.  If the count
    is still wrong, ``get_best_result_list`` is asked to retry.

    Args:
        plist: Sequence of paragraph nodes.  Each must support ``copy``,
            ``find_all("sup")`` and ``get_text()`` (presumably
            BeautifulSoup tags -- confirm against callers).

    Returns:
        list[str]: The translated paragraphs as returned by the
        extraction / retry helpers.
    """
    plist_len = len(plist)

    # Format input with explicit paragraph numbering.
    formatted_paragraphs = []
    for i, p in enumerate(plist, 1):
        temp_p = copy(p)
        for sup in temp_p.find_all("sup"):
            sup.extract()
        formatted_paragraphs.append(f"<p{i}>{temp_p.get_text().strip()}</p{i}>")

    # Join with single newlines for cleaner input.
    new_str = "\n".join(formatted_paragraphs)

    print(f"plist len = {plist_len}")

    # Structured prompt that forces an exact paragraph count.  "{{language}}"
    # is left as a literal "{language}" placeholder; it is presumably filled
    # in later when the template is formatted -- confirm in translate().
    structured_prompt = (
        f"Translate the following {plist_len} paragraphs to {{language}}. "
        f"CRUCIAL: Your output MUST contain EXACTLY {plist_len} paragraphs. "
        "Each paragraph is wrapped in numbered tags like <p1>text</p1>. "
        "Preserve these exact tags in your output, only translating the text inside them. "
        f"Example output format: <p1>translated text for paragraph 1</p1>\n<p2>translated text for paragraph 2</p2>\n...\n<p{plist_len}>translated text for paragraph {plist_len}</p{plist_len}>"
    )

    # The prompt template is instance state shared by every later call, so
    # it must be restored even if the API call or a retry raises; otherwise
    # the next batch would be sent with this batch's paragraph count.
    original_prompt_template = self.prompt_template
    self.prompt_template = structured_prompt + " `{text}`"
    retry_count = 0
    try:
        # First translation attempt.
        translated_text = self.translate(new_str, False)

        # Extract paragraphs using the tags.
        result_list = self.extract_tagged_paragraphs(translated_text, plist_len)

        # Only the retry phase is timed, as before.
        start_time = time.time()
        if len(result_list) != plist_len:
            result_list, retry_count = self.get_best_result_list(
                plist_len,
                new_str,
                6,  # presumably the sleep between retries, in seconds -- TODO confirm
                result_list,
            )
        end_time = time.time()
    finally:
        self.prompt_template = original_prompt_template

    # NOTE(review): get_best_result_list appears to pad/trim its result to
    # plist_len, so "fail" may be unreachable after a retry -- confirm.
    state = "fail" if len(result_list) != plist_len else "success"
    log_path = "log/buglog.txt"

    self.log_retry(state, retry_count, end_time - start_time, log_path)
    if state == "fail":
        self.log_translation_mismatch(
            plist_len, result_list, new_str, "\n", log_path
        )

    return result_list
|
||||
|
||||
def extract_tagged_paragraphs(self, text, plist_len):
    """Split a translated reply into paragraphs, expecting ``plist_len``.

    Strategies are tried in order; the first that yields exactly
    ``plist_len`` paragraphs wins:

    1. Exact tags ``<p1>...</p1>`` through ``<pN>...</pN>``.
    2. Any ``<pK>...</pK>`` tags, sorted by ``K``.
    3. ``(1) ... (2) ...`` style markers.
    4. Non-empty lines grouped by leading ``(K)`` / ``K.`` markers.
    5. Last resort: the first ``plist_len`` non-empty lines.

    Args:
        text: Raw model output.  ``None`` or ``""`` yields ``[]``.
        plist_len: Number of paragraphs that were sent for translation.

    Returns:
        list[str]: Extracted paragraphs.  The list may be shorter than
        ``plist_len`` when only the last resort applies, so callers must
        still check the length.
    """
    if not text:
        return []

    # 1. Exact numbered tags.  A partial hit is useless because the
    #    positions would be misaligned, so stop at the first missing tag.
    tagged = []
    for i in range(1, plist_len + 1):
        match = re.search(rf"<p{i}>(.*?)</p{i}>", text, re.DOTALL)
        if match is None:
            break
        tagged.append(match.group(1).strip())
    if len(tagged) == plist_len:
        return tagged

    # 2. Right number of tags but unexpected numbering: order by number.
    numbered = re.findall(r"<p(\d+)>(.*?)</p\1>", text, re.DOTALL)
    if numbered and len(numbered) == plist_len:
        numbered.sort(key=lambda pair: int(pair[0]))
        return [body.strip() for _, body in numbered]

    # 3. "(1) ... (2) ..." markers.  The marker is outside the capture
    #    group, so the captured text needs no further stripping; stripping
    #    again would eat paragraphs that begin with a number.
    marked = []
    for i in range(1, plist_len + 1):
        match = re.search(
            rf"\({i}\)\s*(.*?)(?=\s*\({i + 1}\)|\Z)", text, re.DOTALL
        )
        if match:
            marked.append(match.group(1).strip())
    if len(marked) == plist_len:
        return marked

    # 4/5. Line-based fallbacks.  Drop stray <pK> / </pK> scaffolding first
    #      so half-closed tags do not leak into the translated text.
    lines = [re.sub(r"</?p\d+>", "", line).strip() for line in text.splitlines()]
    lines = [line for line in lines if line]

    # "(K)" or "K." at the start of a line.  "K." must not be followed by a
    # digit, so decimals such as "3.14" are not mistaken for markers, and
    # bare leading numbers ("1984 ...") are never stripped.
    marker = re.compile(r"^(?:\(\d+\)|\d+\.(?!\d))\s*")
    starts = [idx for idx, line in enumerate(lines) if marker.match(line)]

    if len(starts) == plist_len:
        # Each paragraph runs from its marker line up to the next marker.
        ends = starts[1:] + [len(lines)]
        return [
            marker.sub("", " ".join(lines[begin:end]), count=1)
            for begin, end in zip(starts, ends)
        ]

    # Last resort: the first plist_len lines (or all of them, if fewer).
    return [marker.sub("", line, count=1) for line in lines[:plist_len]]
|
||||
|
||||
def extract_paragraphs(self, text, paragraph_count):
|
||||
|
Loading…
x
Reference in New Issue
Block a user