From a9f34b09d41c6a299a9c070bc2e2fc0aa78c0d47 Mon Sep 17 00:00:00 2001 From: arkohut <39525455+arkohut@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:36:08 +0800 Subject: [PATCH] feat(index): do not add ocr for embedding --- memos/indexing.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/memos/indexing.py b/memos/indexing.py index 80eabba..9534388 100644 --- a/memos/indexing.py +++ b/memos/indexing.py @@ -54,6 +54,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]: response = client.post( f"{ollama_endpoint}/api/embed", json={"model": ollama_model, "input": texts}, + timeout=30 ) if response.status_code == 200: print("Successfully retrieved embeddings from the embedding service.") @@ -68,6 +69,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]: def generate_metadata_text(metadata_entries): + # 暂时不使用ocr结果 def process_ocr_result(metadata): try: ocr_data = json.loads(metadata.value) @@ -84,21 +86,17 @@ def generate_metadata_text(metadata_entries): except json.JSONDecodeError: return metadata.value - return "\n\n".join( - [ - ( - f"key: {metadata.key}\nvalue:\n{process_ocr_result(metadata)}" - if metadata.key == "ocr_result" - and metadata.data_type == MetadataType.JSON_DATA - else ( - f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}" - if metadata.data_type == MetadataType.JSON_DATA - else f"key: {metadata.key}\nvalue:\n{metadata.value}" - ) - ) - for metadata in metadata_entries - ] - ) + non_ocr_metadata = [ + ( + f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}" + if metadata.data_type == MetadataType.JSON_DATA + else f"key: {metadata.key}\nvalue:\n{metadata.value}" + ) + for metadata in metadata_entries + if metadata.key != "ocr_result" + ] + metadata_text = "\n\n".join(non_ocr_metadata) + return metadata_text def bulk_upsert(client, entities):