fix(index): ignore empty text for embedding

This commit is contained in:
arkohut 2024-08-29 21:38:01 +08:00
parent 0d0f14c526
commit 1a08a44a4d

View File

@@ -104,11 +104,14 @@ def generate_metadata_text(metadata_entries):
def bulk_upsert(client, entities): def bulk_upsert(client, entities):
documents = [] documents = []
metadata_texts = [] metadata_texts = []
entities_with_metadata = []
for entity in entities: for entity in entities:
metadata_text = generate_metadata_text(entity.metadata_entries) metadata_text = generate_metadata_text(entity.metadata_entries)
print(f"metadata_text: {len(metadata_text)}") print(f"metadata_text: {len(metadata_text)}")
if metadata_text:
metadata_texts.append(metadata_text) metadata_texts.append(metadata_text)
entities_with_metadata.append(entity)
documents.append( documents.append(
EntityIndexItem( EntityIndexItem(
@@ -141,11 +144,9 @@ def bulk_upsert(client, entities):
).model_dump(mode="json") ).model_dump(mode="json")
) )
# 批量获取嵌入向量
print(f"Getting embeddings for {len(metadata_texts)} texts")
embeddings = get_embeddings(metadata_texts) embeddings = get_embeddings(metadata_texts)
# 将嵌入向量添加到文档中 for doc, embedding, entity in zip(documents, embeddings, entities):
for doc, embedding in zip(documents, embeddings): if entity in entities_with_metadata:
doc["embedding"] = embedding doc["embedding"] = embedding
# Sync the entity data to Typesense # Sync the entity data to Typesense