Mirror of https://github.com/tcsenpai/pensieve.git (synced 2025-06-10 04:57:12 +00:00)
feat(indexing): support facet
This commit is contained in:
parent 2cf75bee7f
commit 11e447bcbb
Indexing module (inferred; file paths were not preserved in this view):

@@ -8,6 +8,12 @@ from .schemas import (
     EntityIndexItem,
     MetadataIndexItem,
     EntitySearchResult,
+    SearchResult,
+    Facet,
+    SearchHit,
+    TextMatchInfo,
+    HybridSearchInfo,
+    RequestParams,
 )
 from .config import TYPESENSE_COLLECTION_NAME
 
@@ -133,7 +139,9 @@ def upsert(client, entity):
 
     # Sync the entity data to Typesense
     try:
-        client.collections[TYPESENSE_COLLECTION_NAME].documents.upsert(entity_data.model_dump_json())
+        client.collections[TYPESENSE_COLLECTION_NAME].documents.upsert(
+            entity_data.model_dump_json()
+        )
     except Exception as e:
         raise Exception(
             f"Failed to sync entity to Typesense: {str(e)}",
@@ -199,11 +207,13 @@ def search_entities(
     q: str,
     library_ids: List[int] = None,
     folder_ids: List[int] = None,
+    tags: List[str] = None,
+    created_dates: List[str] = None,
     limit: int = 48,
     offset: int = 0,
     start: int = None,
     end: int = None,
-) -> List[EntitySearchResult]:
+) -> SearchResult:
     try:
         filter_by = []
         if library_ids:
@@ -212,6 +222,10 @@ def search_entities(
             filter_by.append(f"folder_id:[{','.join(map(str, folder_ids))}]")
         if start is not None and end is not None:
             filter_by.append(f"file_created_at:={start}..{end}")
+        if tags:
+            filter_by.append(f"tags:=[{','.join(tags)}]")
+        if created_dates:
+            filter_by.append(f"created_date:[{','.join(created_dates)}]")
 
         filter_by_str = " && ".join(filter_by) if filter_by else ""
         search_parameters = {
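Aside: a minimal sketch of the filter string this block composes, with hypothetical inputs. In Typesense filter syntax, field:[a,b] matches any of the listed values, while := requests an exact match.

# Sketch only: reproduces the f-strings above with sample values.
folder_ids = [3, 7]
tags = ["screenshot", "receipt"]
created_dates = ["2024-05-01", "2024-05-02"]

filter_by = []
filter_by.append(f"folder_id:[{','.join(map(str, folder_ids))}]")
filter_by.append(f"tags:=[{','.join(tags)}]")
filter_by.append(f"created_date:[{','.join(created_dates)}]")

print(" && ".join(filter_by))
# folder_id:[3,7] && tags:=[screenshot,receipt] && created_date:[2024-05-01,2024-05-02]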
@@ -227,36 +241,59 @@ def search_entities(
             "offset": offset,
             "exclude_fields": "metadata_text,embedding",
             "sort_by": "_text_match:desc",
+            "facet_by": "created_date,created_month,created_year,tags",
         }
 
         search_results = client.collections[TYPESENSE_COLLECTION_NAME].documents.search(
             search_parameters
         )
-        return [
-            EntitySearchResult(
-                id=hit["document"]["id"],
-                filepath=hit["document"]["filepath"],
-                filename=hit["document"]["filename"],
-                size=hit["document"]["size"],
-                file_created_at=hit["document"]["file_created_at"],
-                file_last_modified_at=hit["document"]["file_last_modified_at"],
-                file_type=hit["document"]["file_type"],
-                file_type_group=hit["document"]["file_type_group"],
-                last_scan_at=hit["document"].get("last_scan_at"),
-                library_id=hit["document"]["library_id"],
-                folder_id=hit["document"]["folder_id"],
-                tags=hit["document"]["tags"],
-                metadata_entries=[
-                    MetadataIndexItem(
-                        key=entry["key"], value=entry["value"], source=entry["source"]
-                    )
-                    for entry in hit["document"]["metadata_entries"]
-                ],
-                created_date=hit["document"]["created_date"],
-                created_month=hit["document"]["created_month"],
-                created_year=hit["document"]["created_year"],
-            )
-            for hit in search_results["hits"]
-        ]
+        hits = [
+            SearchHit(
+                document=EntitySearchResult(
+                    id=hit["document"]["id"],
+                    filepath=hit["document"]["filepath"],
+                    filename=hit["document"]["filename"],
+                    size=hit["document"]["size"],
+                    file_created_at=hit["document"]["file_created_at"],
+                    file_last_modified_at=hit["document"]["file_last_modified_at"],
+                    file_type=hit["document"]["file_type"],
+                    file_type_group=hit["document"]["file_type_group"],
+                    last_scan_at=hit["document"].get("last_scan_at"),
+                    library_id=hit["document"]["library_id"],
+                    folder_id=hit["document"]["folder_id"],
+                    tags=hit["document"]["tags"],
+                    metadata_entries=[
+                        MetadataIndexItem(
+                            key=entry["key"],
+                            value=entry["value"],
+                            source=entry["source"],
+                        )
+                        for entry in hit["document"]["metadata_entries"]
+                    ],
+                    created_date=hit["document"].get("created_date"),
+                    created_month=hit["document"].get("created_month"),
+                    created_year=hit["document"].get("created_year"),
+                ),
+                highlight=hit.get("highlight", {}),
+                highlights=hit.get("highlights", []),
+                hybrid_search_info=HybridSearchInfo(**hit["hybrid_search_info"]),
+                text_match=hit["text_match"],
+                text_match_info=TextMatchInfo(**hit["text_match_info"]),
+            )
+            for hit in search_results["hits"]
+        ]
+
+        return SearchResult(
+            facet_counts=[Facet(**facet) for facet in search_results["facet_counts"]],
+            found=search_results["found"],
+            hits=hits,
+            out_of=search_results["out_of"],
+            page=search_results["page"],
+            request_params=RequestParams(**search_results["request_params"]),
+            search_cutoff=search_results["search_cutoff"],
+            search_time_ms=search_results["search_time_ms"],
+        )
     except Exception as e:
         raise Exception(
             f"Failed to search entities: {str(e)}",
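For orientation, the top-level keys of a Typesense search response that the SearchResult construction above consumes look roughly like this; all values are illustrative, not taken from the commit.

# Illustrative response shape only; values are made up.
search_results = {
    "facet_counts": [],  # one entry per field listed in facet_by
    "found": 42,         # total number of matching documents
    "hits": [],          # the current page of matches
    "out_of": 1000,      # total documents in the collection
    "page": 1,
    "request_params": {
        # collection_name value is hypothetical here
        "collection_name": "entities",
        "first_q": "invoice",
        "per_page": 48,
        "q": "invoice",
    },
    "search_cutoff": False,
    "search_time_ms": 7,
}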
@@ -265,7 +302,9 @@ def search_entities(
 
 def fetch_entity_by_id(client, id: str) -> EntityIndexItem:
     try:
-        document = client.collections[TYPESENSE_COLLECTION_NAME].documents[id].retrieve()
+        document = (
+            client.collections[TYPESENSE_COLLECTION_NAME].documents[id].retrieve()
+        )
         return EntitySearchResult(
             id=document["id"],
             filepath=document["filepath"],
Schemas module (inferred):

@@ -1,5 +1,5 @@
 from pydantic import BaseModel, ConfigDict, DirectoryPath, HttpUrl, Field
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Dict
 from datetime import datetime
 from enum import Enum
 
@@ -204,4 +204,56 @@ class EntitySearchResult(BaseModel):
     library_id: int
     folder_id: int
     tags: List[str]
     metadata_entries: List[MetadataIndexItem]
+    facets: Optional[Dict[str, Any]] = None
+
+
+class FacetCount(BaseModel):
+    count: int
+    highlighted: str
+    value: str
+
+class FacetStats(BaseModel):
+    total_values: int
+
+class Facet(BaseModel):
+    counts: List[FacetCount]
+    field_name: str
+    sampled: bool
+    stats: FacetStats
+
+class TextMatchInfo(BaseModel):
+    best_field_score: str
+    best_field_weight: int
+    fields_matched: int
+    num_tokens_dropped: int
+    score: str
+    tokens_matched: int
+    typo_prefix_score: int
+
+class HybridSearchInfo(BaseModel):
+    rank_fusion_score: float
+
+class SearchHit(BaseModel):
+    document: EntitySearchResult
+    highlight: Dict[str, Any] = {}
+    highlights: List[Any] = []
+    hybrid_search_info: HybridSearchInfo
+    text_match: int
+    text_match_info: TextMatchInfo
+
+class RequestParams(BaseModel):
+    collection_name: str
+    first_q: str
+    per_page: int
+    q: str
+
+class SearchResult(BaseModel):
+    facet_counts: List[Facet]
+    found: int
+    hits: List[SearchHit]
+    out_of: int
+    page: int
+    request_params: RequestParams
+    search_cutoff: bool
+    search_time_ms: int
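A quick self-check, with hypothetical values and assuming the classes above are in scope, that a Typesense-shaped facet entry validates against the new models:

# Sketch: pydantic validates the nested dicts into FacetCount/FacetStats.
facet = Facet(
    **{
        "field_name": "tags",
        "sampled": False,
        "stats": {"total_values": 2},
        "counts": [
            {"value": "screenshot", "highlighted": "screenshot", "count": 12},
            {"value": "receipt", "highlighted": "receipt", "count": 5},
        ],
    }
)
assert facet.counts[0].value == "screenshot"
assert facet.stats.total_values == 2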
Server module (inferred):

@@ -39,6 +39,7 @@ from .schemas import (
     EntityIndexItem,
     MetadataIndexItem,
     EntitySearchResult,
+    SearchResult
 )
 
 # Import the logging configuration
@@ -439,24 +440,26 @@ def list_entitiy_indices_in_folder(
     return indexing.list_all_entities(client, library_id, folder_id, limit, offset)
 
 
-@app.get("/search", response_model=List[EntitySearchResult], tags=["search"])
+@app.get("/search", response_model=SearchResult, tags=["search"])
 async def search_entities(
     q: str,
     library_ids: str = Query(None, description="Comma-separated list of library IDs"),
     folder_ids: str = Query(None, description="Comma-separated list of folder IDs"),
+    tags: str = Query(None, description="Comma-separated list of tags"),
+    created_dates: str = Query(None, description="Comma-separated list of created dates in YYYY-MM-DD format"),
     limit: Annotated[int, Query(ge=1, le=200)] = 48,
     offset: int = 0,
     start: int = None,
     end: int = None,
     db: Session = Depends(get_db),
 ):
-    library_ids = (
-        [int(id) for id in library_ids.split(",") if id] if library_ids else None
-    )
-    folder_ids = [int(id) for id in folder_ids.split(",") if id] if folder_ids else None
+    library_ids = [int(id) for id in library_ids.split(",")] if library_ids else None
+    folder_ids = [int(id) for id in folder_ids.split(",")] if folder_ids else None
+    tags = [tag.strip() for tag in tags.split(",")] if tags else None
+    created_dates = [date.strip() for date in created_dates.split(",")] if created_dates else None
     try:
         return indexing.search_entities(
-            client, q, library_ids, folder_ids, limit, offset, start, end
+            client, q, library_ids, folder_ids, tags, created_dates, limit, offset, start, end
        )
     except Exception as e:
         print(f"Error searching entities: {e}")
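A hedged usage sketch of the extended endpoint. Host, port, and all parameter values are assumptions; the comma-separated strings are split server-side as shown above.

# Hypothetical client call; adjust host/port to your deployment.
import requests

resp = requests.get(
    "http://localhost:8080/search",
    params={
        "q": "invoice",
        "tags": "receipt,finance",
        "created_dates": "2024-05-01,2024-05-02",
        "limit": 10,
    },
)
data = resp.json()  # matches the SearchResult schema
print(data["found"], [facet["field_name"] for facet in data["facet_counts"]])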