mirror of
https://github.com/tcsenpai/pensieve.git
synced 2025-06-09 04:35:26 +00:00
feat(index): exclude OCR results from the text used for embedding
This commit is contained in:
parent
e99792a974
commit
a9f34b09d4
@ -54,6 +54,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]:
|
|||||||
response = client.post(
|
response = client.post(
|
||||||
f"{ollama_endpoint}/api/embed",
|
f"{ollama_endpoint}/api/embed",
|
||||||
json={"model": ollama_model, "input": texts},
|
json={"model": ollama_model, "input": texts},
|
||||||
|
timeout=30
|
||||||
)
|
)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
print("Successfully retrieved embeddings from the embedding service.")
|
print("Successfully retrieved embeddings from the embedding service.")
|
||||||
@ -68,6 +69,7 @@ def get_embeddings(texts: List[str]) -> List[List[float]]:
|
|||||||
|
|
||||||
|
|
||||||
def generate_metadata_text(metadata_entries):
|
def generate_metadata_text(metadata_entries):
|
||||||
|
# OCR results are not used for now
|
||||||
def process_ocr_result(metadata):
|
def process_ocr_result(metadata):
|
||||||
try:
|
try:
|
||||||
ocr_data = json.loads(metadata.value)
|
ocr_data = json.loads(metadata.value)
|
||||||
@ -84,21 +86,17 @@ def generate_metadata_text(metadata_entries):
|
|||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
return metadata.value
|
return metadata.value
|
||||||
|
|
||||||
return "\n\n".join(
|
non_ocr_metadata = [
|
||||||
[
|
(
|
||||||
(
|
f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
|
||||||
f"key: {metadata.key}\nvalue:\n{process_ocr_result(metadata)}"
|
if metadata.data_type == MetadataType.JSON_DATA
|
||||||
if metadata.key == "ocr_result"
|
else f"key: {metadata.key}\nvalue:\n{metadata.value}"
|
||||||
and metadata.data_type == MetadataType.JSON_DATA
|
)
|
||||||
else (
|
for metadata in metadata_entries
|
||||||
f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
|
if metadata.key != "ocr_result"
|
||||||
if metadata.data_type == MetadataType.JSON_DATA
|
]
|
||||||
else f"key: {metadata.key}\nvalue:\n{metadata.value}"
|
metadata_text = "\n\n".join(non_ocr_metadata)
|
||||||
)
|
return metadata_text
|
||||||
)
|
|
||||||
for metadata in metadata_entries
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def bulk_upsert(client, entities):
|
def bulk_upsert(client, entities):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user