feat: add index

This commit is contained in:
arkohut 2024-06-21 18:47:01 +08:00
parent 7c3017f85b
commit 8a1e04f997
4 changed files with 237 additions and 29 deletions

View File

@ -23,6 +23,7 @@ BASE_URL = "http://localhost:8080"
ignore_files = [".DS_Store"]
def format_timestamp(timestamp):
if isinstance(timestamp, str):
return timestamp
@ -55,7 +56,9 @@ def display_libraries(libraries):
]
)
print(tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain"))
print(
tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain")
)
@app.command()
@ -253,6 +256,91 @@ def scan(library_id: int):
print(f"Total files deleted: {total_files_deleted}")
@lib_app.command("index")
def index(library_id: int):
print(f"Indexing library {library_id}")
# Get the library
response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
if response.status_code != 200:
print(f"Failed to get library: {response.status_code} - {response.text}")
return
library = response.json()
scanned_entities = set()
# Iterate through folders
for folder in library["folders"]:
tqdm.write(f"Processing folder: {folder['id']}")
# List all entities in the folder
offset = 0
while True:
entities_response = httpx.get(
f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
params={"limit": 200, "offset": offset},
)
if entities_response.status_code != 200:
tqdm.write(
f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
)
break
entities = entities_response.json()
if not entities:
break
# Index each entity
for entity in tqdm(entities, desc="Indexing entities", leave=False):
index_response = httpx.post(f"{BASE_URL}/entities/{entity['id']}/index")
if index_response.status_code == 204:
tqdm.write(f"Indexed entity: {entity['id']}")
else:
tqdm.write(
f"Failed to index entity {entity['id']}: {index_response.status_code} - {index_response.text}"
)
scanned_entities.add(str(entity["id"]))
offset += 200
# List all indexed entities in the folder
offset = 0
while True:
index_response = httpx.get(
f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/index",
params={"limit": 200, "offset": offset},
)
if index_response.status_code != 200:
tqdm.write(
f"Failed to get indexed entities: {index_response.status_code} - {index_response.text}"
)
break
indexed_entities = index_response.json()
if not indexed_entities:
break
# Delete indexes for entities not in scanned_entities
for indexed_entity in tqdm(
indexed_entities, desc="Cleaning up indexes", leave=False
):
if indexed_entity["id"] not in scanned_entities:
delete_response = httpx.delete(
f"{BASE_URL}/entities/{indexed_entity['id']}/index"
)
if delete_response.status_code == 204:
tqdm.write(f"Deleted index for entity: {indexed_entity['id']}")
else:
tqdm.write(
f"Failed to delete index for entity {indexed_entity['id']}: {delete_response.status_code} - {delete_response.text}"
)
offset += 200
print("Indexing completed")
def display_plugins(plugins):
table = []
for plugin in plugins:

View File

@ -1,6 +1,7 @@
import json
from typing import List
from .schemas import MetadataType, EntityMetadata
from .schemas import MetadataType, EntityMetadata, EntityIndexItem, MetadataIndexItem
def convert_metadata_value(metadata: EntityMetadata):
@ -16,31 +17,28 @@ def convert_metadata_value(metadata: EntityMetadata):
def upsert(client, entity):
# Prepare the entity data for Typesense
entity_data = {
"id": str(entity.id),
"filepath": entity.filepath,
"filename": entity.filename,
"size": entity.size,
"file_created_at": int(entity.file_created_at.timestamp()),
"file_last_modified_at": int(entity.file_last_modified_at.timestamp()),
"file_type": entity.file_type,
"file_type_group": entity.file_type_group,
"last_scan_at": (
int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None
),
"library_id": entity.library_id,
"folder_id": entity.folder_id,
"tags": [tag.name for tag in entity.tags],
"metadata_entries": [
{
"key": metadata.key,
"value": convert_metadata_value(metadata),
"source": metadata.source,
}
entity_data = EntityIndexItem(
id=str(entity.id),
filepath=entity.filepath,
filename=entity.filename,
size=entity.size,
file_created_at=int(entity.file_created_at.timestamp()),
file_last_modified_at=int(entity.file_last_modified_at.timestamp()),
file_type=entity.file_type,
file_type_group=entity.file_type_group,
last_scan_at=int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None,
library_id=entity.library_id,
folder_id=entity.folder_id,
tags=[tag.name for tag in entity.tags],
metadata_entries=[
MetadataIndexItem(
key=metadata.key,
value=convert_metadata_value(metadata),
source=metadata.source,
)
for metadata in entity.metadata_entries
],
"metadata_text": "\n\n".join(
metadata_text="\n\n".join(
[
(
f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
@ -50,12 +48,61 @@ def upsert(client, entity):
for metadata in entity.metadata_entries
]
),
}
)
# Sync the entity data to Typesense
try:
client.collections["entities"].documents.upsert(entity_data)
client.collections["entities"].documents.upsert(entity_data.model_dump_json())
except Exception as e:
raise Exception(
f"Failed to sync entity to Typesense: {str(e)}",
)
def remove_entity_by_id(client, entity_id):
    """Delete the document for *entity_id* from the Typesense "entities" collection.

    Args:
        client: Typesense client instance.
        entity_id: Id of the document to delete.

    Raises:
        Exception: wrapping any client error, with the original error
            explicitly chained as the cause (``raise ... from e``) so the
            underlying Typesense failure is preserved in the traceback.
    """
    try:
        client.collections["entities"].documents[entity_id].delete()
    except Exception as e:
        raise Exception(
            f"Failed to remove entity from Typesense: {str(e)}",
        ) from e
def _hit_to_index_item(document) -> EntityIndexItem:
    """Build an EntityIndexItem from one raw Typesense hit document."""
    return EntityIndexItem(
        id=document["id"],
        filepath=document["filepath"],
        filename=document["filename"],
        size=document["size"],
        file_created_at=document["file_created_at"],
        file_last_modified_at=document["file_last_modified_at"],
        file_type=document["file_type"],
        file_type_group=document["file_type_group"],
        # last_scan_at is optional in the schema, so absent keys map to None.
        last_scan_at=document.get("last_scan_at"),
        library_id=document["library_id"],
        folder_id=document["folder_id"],
        tags=document["tags"],
        metadata_entries=[
            MetadataIndexItem(
                key=entry["key"],
                value=entry["value"],
                source=entry["source"],
            )
            for entry in document["metadata_entries"]
        ],
        metadata_text=document["metadata_text"],
    )


def list_all_entities(
    client, library_id: int, folder_id: int, limit=100, offset=0
) -> List[EntityIndexItem]:
    """Search Typesense for the indexed entities of one folder of a library.

    Args:
        client: Typesense client instance.
        library_id: Library to filter on.
        folder_id: Folder to filter on.
        limit: Page size (``per_page``).
        offset: Row offset; converted to a Typesense page number below, so
            it should be a multiple of ``limit`` for exact pagination.

    Returns:
        A list of EntityIndexItem built from the search hits.

    Raises:
        Exception: wrapping any search error, with the original error
            chained as the cause.
    """
    try:
        response = client.collections["entities"].documents.search(
            {
                "q": "*",
                "filter_by": f"library_id:={library_id} && folder_id:={folder_id}",
                "per_page": limit,
                # Typesense paginates by page number, not offset; this is
                # only exact when offset is a multiple of limit.
                "page": offset // limit + 1,
            }
        )
        return [_hit_to_index_item(hit["document"]) for hit in response["hits"]]
    except Exception as e:
        raise Exception(
            f"Failed to list entities for library {library_id} and folder {folder_id}: {str(e)}",
        ) from e

View File

@ -1,5 +1,5 @@
from pydantic import BaseModel, ConfigDict, DirectoryPath, HttpUrl
from typing import List
from pydantic import BaseModel, ConfigDict, DirectoryPath, HttpUrl, Field
from typing import List, Optional, Any
from datetime import datetime
from enum import Enum
@ -140,3 +140,25 @@ class Entity(BaseModel):
model_config = ConfigDict(from_attributes=True)
class MetadataIndexItem(BaseModel):
    """One metadata entry as stored on an indexed entity document."""

    key: str
    # Value after conversion for indexing (see convert_metadata_value in the
    # indexing module) — type varies, hence Any.
    value: Any
    source: str
class EntityIndexItem(BaseModel):
    """Shape of an entity document in the search index (Typesense)."""

    # Stringified entity id (search-engine document ids are strings).
    id: str
    filepath: str
    filename: str
    size: int
    file_created_at: int = Field(..., description="Unix timestamp")
    file_last_modified_at: int = Field(..., description="Unix timestamp")
    file_type: str
    file_type_group: str
    last_scan_at: Optional[int] = Field(None, description="Unix timestamp")
    library_id: int
    folder_id: int
    tags: List[str]
    metadata_entries: List[MetadataIndexItem]
    # Pre-rendered text blob of all metadata entries, used for full-text search.
    metadata_text: str

View File

@ -28,6 +28,8 @@ from .schemas import (
UpdateEntityTagsParam,
UpdateEntityMetadataParam,
MetadataType,
EntityIndexItem,
MetadataIndexItem,
)
engine = create_engine(f"sqlite:///{get_database_path()}")
@ -280,6 +282,55 @@ async def sync_entity_to_typesense(entity_id: int, db: Session = Depends(get_db)
return None
@app.delete(
    "/entities/{entity_id}/index",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["entity"],
)
async def remove_entity_from_typesense(entity_id: int, db: Session = Depends(get_db)):
    """Remove an entity's document from the Typesense index.

    Responds 204 on success, 404 when the entity does not exist, and 500
    when the index deletion itself fails.
    """
    # Guard: the entity must exist before we touch the index.
    if crud.get_entity_by_id(entity_id, db) is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Entity not found",
        )

    try:
        indexing.remove_entity_by_id(client, entity_id)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        )

    return None
@app.get(
    "/libraries/{library_id}/folders/{folder_id}/index",
    response_model=List[EntityIndexItem],
    tags=["entity"],
)
def list_entities_in_folder(
    library_id: int,
    folder_id: int,
    limit: Annotated[int, Query(ge=1, le=200)] = 10,
    offset: int = 0,
    db: Session = Depends(get_db),
):
    """Return the indexed (Typesense) documents for one folder of a library."""
    # Both the library and the folder must exist, and the folder must
    # belong to the requested library.
    library = crud.get_library_by_id(library_id, db)
    if library is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Library not found"
        )

    known_folder_ids = {folder.id for folder in library.folders}
    if folder_id not in known_folder_ids:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Folder not found in the specified library",
        )

    return indexing.list_all_entities(client, library_id, folder_id, limit, offset)
@app.patch("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"])
@app.put("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"])
def patch_entity_tags(