feat(index): do not add ocr for embedding

This commit is contained in:
arkohut 2024-09-03 18:36:08 +08:00
parent e99792a974
commit a9f34b09d4

View File

@@ -54,6 +54,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]:
response = client.post( response = client.post(
f"{ollama_endpoint}/api/embed", f"{ollama_endpoint}/api/embed",
json={"model": ollama_model, "input": texts}, json={"model": ollama_model, "input": texts},
timeout=30
) )
if response.status_code == 200: if response.status_code == 200:
print("Successfully retrieved embeddings from the embedding service.") print("Successfully retrieved embeddings from the embedding service.")
@@ -68,6 +69,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]:
def generate_metadata_text(metadata_entries): def generate_metadata_text(metadata_entries):
# 暂时不使用ocr结果
def process_ocr_result(metadata): def process_ocr_result(metadata):
try: try:
ocr_data = json.loads(metadata.value) ocr_data = json.loads(metadata.value)
@@ -84,21 +86,17 @@ def generate_metadata_text(metadata_entries):
except json.JSONDecodeError: except json.JSONDecodeError:
return metadata.value return metadata.value
return "\n\n".join( non_ocr_metadata = [
[ (
( f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
f"key: {metadata.key}\nvalue:\n{process_ocr_result(metadata)}" if metadata.data_type == MetadataType.JSON_DATA
if metadata.key == "ocr_result" else f"key: {metadata.key}\nvalue:\n{metadata.value}"
and metadata.data_type == MetadataType.JSON_DATA )
else ( for metadata in metadata_entries
f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}" if metadata.key != "ocr_result"
if metadata.data_type == MetadataType.JSON_DATA ]
else f"key: {metadata.key}\nvalue:\n{metadata.value}" metadata_text = "\n\n".join(non_ocr_metadata)
) return metadata_text
)
for metadata in metadata_entries
]
)
def bulk_upsert(client, entities): def bulk_upsert(client, entities):