mirror of
https://github.com/yihong0618/bilingual_book_maker.git
synced 2025-06-05 19:15:34 +00:00
fix: resolve translation paragraph count mismatch by explicitly instructing the LLM about paragraph requirements
This commit is contained in:
parent
cc4f4c4dae
commit
c780f7c516
@ -75,7 +75,7 @@ class ChatGPTAPI(Base):
|
|||||||
api_base=None,
|
api_base=None,
|
||||||
prompt_template=None,
|
prompt_template=None,
|
||||||
prompt_sys_msg=None,
|
prompt_sys_msg=None,
|
||||||
temperature=1.3,
|
temperature=1.0,
|
||||||
context_flag=False,
|
context_flag=False,
|
||||||
context_paragraph_limit=0,
|
context_paragraph_limit=0,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -301,22 +301,20 @@ class ChatGPTAPI(Base):
|
|||||||
def translate_list(self, plist):
|
def translate_list(self, plist):
|
||||||
plist_len = len(plist)
|
plist_len = len(plist)
|
||||||
|
|
||||||
# 创建原始文本列表,并为每个段落添加明确的编号标记
|
# Create a list of original texts and add clear numbering markers to each paragraph
|
||||||
formatted_text = ""
|
formatted_text = ""
|
||||||
for i, p in enumerate(plist, 1):
|
for i, p in enumerate(plist, 1):
|
||||||
temp_p = copy(p)
|
temp_p = copy(p)
|
||||||
for sup in temp_p.find_all("sup"):
|
for sup in temp_p.find_all("sup"):
|
||||||
sup.extract()
|
sup.extract()
|
||||||
para_text = temp_p.get_text().strip()
|
para_text = temp_p.get_text().strip()
|
||||||
# 使用特殊的分隔符和明确的编号
|
# Using special delimiters and clear numbering
|
||||||
formatted_text += f"PARAGRAPH {i}:\n{para_text}\n\n"
|
formatted_text += f"PARAGRAPH {i}:\n{para_text}\n\n"
|
||||||
|
|
||||||
print(f"plist len = {plist_len}")
|
print(f"plist len = {plist_len}")
|
||||||
|
|
||||||
# 保存原始提示模板
|
|
||||||
original_prompt_template = self.prompt_template
|
original_prompt_template = self.prompt_template
|
||||||
|
|
||||||
# 创建明确要求保持段落结构的提示
|
|
||||||
structured_prompt = (
|
structured_prompt = (
|
||||||
f"Translate the following {plist_len} paragraphs to {{language}}. "
|
f"Translate the following {plist_len} paragraphs to {{language}}. "
|
||||||
f"CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n"
|
f"CRUCIAL INSTRUCTION: Format your response using EXACTLY this structure:\n\n"
|
||||||
@ -332,10 +330,9 @@ class ChatGPTAPI(Base):
|
|||||||
|
|
||||||
self.prompt_template = structured_prompt + " ```{text}```"
|
self.prompt_template = structured_prompt + " ```{text}```"
|
||||||
|
|
||||||
# 翻译
|
|
||||||
translated_text = self.translate(formatted_text, False)
|
translated_text = self.translate(formatted_text, False)
|
||||||
|
|
||||||
# 从结构化输出中提取翻译
|
# Extract translations from structured output
|
||||||
translated_paragraphs = []
|
translated_paragraphs = []
|
||||||
for i in range(1, plist_len + 1):
|
for i in range(1, plist_len + 1):
|
||||||
pattern = (
|
pattern = (
|
||||||
@ -350,7 +347,6 @@ class ChatGPTAPI(Base):
|
|||||||
translated_paragraphs.append(translated_paragraph)
|
translated_paragraphs.append(translated_paragraph)
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Could not find translation for paragraph {i}")
|
print(f"Warning: Could not find translation for paragraph {i}")
|
||||||
# 尝试更宽松的匹配
|
|
||||||
loose_pattern = (
|
loose_pattern = (
|
||||||
r"(?:TRANSLATION|PARAGRAPH|PARA).*?"
|
r"(?:TRANSLATION|PARAGRAPH|PARA).*?"
|
||||||
+ str(i)
|
+ str(i)
|
||||||
@ -362,20 +358,19 @@ class ChatGPTAPI(Base):
|
|||||||
else:
|
else:
|
||||||
translated_paragraphs.append("")
|
translated_paragraphs.append("")
|
||||||
|
|
||||||
# 恢复原始提示
|
|
||||||
self.prompt_template = original_prompt_template
|
self.prompt_template = original_prompt_template
|
||||||
|
|
||||||
# 如果提取到的段落数不正确,尝试备用提取方法
|
# If the number of extracted paragraphs is incorrect, try the alternative extraction method.
|
||||||
if len(translated_paragraphs) != plist_len:
|
if len(translated_paragraphs) != plist_len:
|
||||||
print(
|
print(
|
||||||
f"Warning: Extracted {len(translated_paragraphs)}/{plist_len} paragraphs. Using fallback extraction."
|
f"Warning: Extracted {len(translated_paragraphs)}/{plist_len} paragraphs. Using fallback extraction."
|
||||||
)
|
)
|
||||||
# 提取所有可能的段落标记
|
|
||||||
all_para_pattern = r"(?:TRANSLATION|PARAGRAPH|PARA).*?(\d+).*?:(.*?)(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)"
|
all_para_pattern = r"(?:TRANSLATION|PARAGRAPH|PARA).*?(\d+).*?:(.*?)(?=(?:TRANSLATION|PARAGRAPH|PARA).*?\d+.*?:|\Z)"
|
||||||
all_matches = re.findall(all_para_pattern, translated_text, re.DOTALL)
|
all_matches = re.findall(all_para_pattern, translated_text, re.DOTALL)
|
||||||
|
|
||||||
if all_matches:
|
if all_matches:
|
||||||
# 创建一个字典,根据段落编号映射翻译内容
|
# Create a dictionary to map translation content based on paragraph numbers
|
||||||
para_dict = {}
|
para_dict = {}
|
||||||
for num_str, content in all_matches:
|
for num_str, content in all_matches:
|
||||||
try:
|
try:
|
||||||
@ -385,7 +380,7 @@ class ChatGPTAPI(Base):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 按原始顺序重建翻译列表
|
# Rebuild the translation list in the original order
|
||||||
new_translated_paragraphs = []
|
new_translated_paragraphs = []
|
||||||
for i in range(1, plist_len + 1):
|
for i in range(1, plist_len + 1):
|
||||||
if i in para_dict:
|
if i in para_dict:
|
||||||
@ -396,7 +391,6 @@ class ChatGPTAPI(Base):
|
|||||||
if len(new_translated_paragraphs) == plist_len:
|
if len(new_translated_paragraphs) == plist_len:
|
||||||
translated_paragraphs = new_translated_paragraphs
|
translated_paragraphs = new_translated_paragraphs
|
||||||
|
|
||||||
# 确保最终有正确数量的段落
|
|
||||||
if len(translated_paragraphs) < plist_len:
|
if len(translated_paragraphs) < plist_len:
|
||||||
translated_paragraphs.extend(
|
translated_paragraphs.extend(
|
||||||
[""] * (plist_len - len(translated_paragraphs))
|
[""] * (plist_len - len(translated_paragraphs))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user