fix(index): ignore empty text for embedding

This commit is contained in:
arkohut 2024-08-29 21:38:01 +08:00
parent 0d0f14c526
commit 1a08a44a4d

View File

@ -104,12 +104,15 @@ def generate_metadata_text(metadata_entries):
def bulk_upsert(client, entities): def bulk_upsert(client, entities):
documents = [] documents = []
metadata_texts = [] metadata_texts = []
entities_with_metadata = []
for entity in entities: for entity in entities:
metadata_text = generate_metadata_text(entity.metadata_entries) metadata_text = generate_metadata_text(entity.metadata_entries)
print(f"metadata_text: {len(metadata_text)}") print(f"metadata_text: {len(metadata_text)}")
metadata_texts.append(metadata_text) if metadata_text:
metadata_texts.append(metadata_text)
entities_with_metadata.append(entity)
documents.append( documents.append(
EntityIndexItem( EntityIndexItem(
id=str(entity.id), id=str(entity.id),
@ -141,12 +144,10 @@ def bulk_upsert(client, entities):
).model_dump(mode="json") ).model_dump(mode="json")
) )
# 批量获取嵌入向量
print(f"Getting embeddings for {len(metadata_texts)} texts")
embeddings = get_embeddings(metadata_texts) embeddings = get_embeddings(metadata_texts)
# 将嵌入向量添加到文档中 for doc, embedding, entity in zip(documents, embeddings, entities):
for doc, embedding in zip(documents, embeddings): if entity in entities_with_metadata:
doc["embedding"] = embedding doc["embedding"] = embedding
# Sync the entity data to Typesense # Sync the entity data to Typesense
try: try: