Mirror of https://github.com/tcsenpai/pensieve.git (synced 2025-06-08 12:15:26 +00:00)
feat: add index
parent 7c3017f85b
commit 8a1e04f997
@@ -23,6 +23,7 @@ BASE_URL = "http://localhost:8080"
 ignore_files = [".DS_Store"]
 
 
 def format_timestamp(timestamp):
     if isinstance(timestamp, str):
         return timestamp
@@ -55,7 +56,9 @@ def display_libraries(libraries):
         ]
     )
 
-    print(tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain"))
+    print(
+        tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain")
+    )
 
 
 @app.command()
@@ -253,6 +256,91 @@ def scan(library_id: int):
     print(f"Total files deleted: {total_files_deleted}")
 
 
+@lib_app.command("index")
+def index(library_id: int):
+    print(f"Indexing library {library_id}")
+
+    # Get the library
+    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
+    if response.status_code != 200:
+        print(f"Failed to get library: {response.status_code} - {response.text}")
+        return
+
+    library = response.json()
+    scanned_entities = set()
+
+    # Iterate through folders
+    for folder in library["folders"]:
+        tqdm.write(f"Processing folder: {folder['id']}")
+
+        # List all entities in the folder
+        offset = 0
+        while True:
+            entities_response = httpx.get(
+                f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
+                params={"limit": 200, "offset": offset},
+            )
+            if entities_response.status_code != 200:
+                tqdm.write(
+                    f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
+                )
+                break
+
+            entities = entities_response.json()
+            if not entities:
+                break
+
+            # Index each entity
+            for entity in tqdm(entities, desc="Indexing entities", leave=False):
+                index_response = httpx.post(f"{BASE_URL}/entities/{entity['id']}/index")
+                if index_response.status_code == 204:
+                    tqdm.write(f"Indexed entity: {entity['id']}")
+                else:
+                    tqdm.write(
+                        f"Failed to index entity {entity['id']}: {index_response.status_code} - {index_response.text}"
+                    )
+
+                scanned_entities.add(str(entity["id"]))
+
+            offset += 200
+
+        # List all indexed entities in the folder
+        offset = 0
+        while True:
+            index_response = httpx.get(
+                f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/index",
+                params={"limit": 200, "offset": offset},
+            )
+            if index_response.status_code != 200:
+                tqdm.write(
+                    f"Failed to get indexed entities: {index_response.status_code} - {index_response.text}"
+                )
+                break
+
+            indexed_entities = index_response.json()
+            if not indexed_entities:
+                break
+
+            # Delete indexes for entities not in scanned_entities
+            for indexed_entity in tqdm(
+                indexed_entities, desc="Cleaning up indexes", leave=False
+            ):
+                if indexed_entity["id"] not in scanned_entities:
+                    delete_response = httpx.delete(
+                        f"{BASE_URL}/entities/{indexed_entity['id']}/index"
+                    )
+                    if delete_response.status_code == 204:
+                        tqdm.write(f"Deleted index for entity: {indexed_entity['id']}")
+                    else:
+                        tqdm.write(
+                            f"Failed to delete index for entity {indexed_entity['id']}: {delete_response.status_code} - {delete_response.text}"
+                        )
+
+            offset += 200
+
+    print("Indexing completed")
+
+
 def display_plugins(plugins):
     table = []
     for plugin in plugins:
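
Note: the new `index` subcommand is essentially a mark-and-sweep sync against the search index. A first pass re-indexes every entity the server still knows about and records its ID; a second pass deletes any index entry whose ID was never seen. A minimal, self-contained sketch of that pattern (all names below are illustrative, not part of the repository):

# Illustrative sketch of the mark-and-sweep sync the `index` command performs.
def sync_index(existing_ids, indexed_ids, index_one, drop_one):
    seen = set()
    for entity_id in existing_ids:   # phase 1: (re)index live entities, mark IDs
        index_one(entity_id)
        seen.add(entity_id)
    for entity_id in indexed_ids:    # phase 2: sweep index entries never marked
        if entity_id not in seen:
            drop_one(entity_id)

# Example: entity "3" is indexed but no longer exists, so it gets dropped.
sync_index(
    existing_ids=["1", "2"],
    indexed_ids=["1", "2", "3"],
    index_one=lambda i: print(f"indexed {i}"),
    drop_one=lambda i: print(f"dropped {i}"),
)
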
@@ -1,6 +1,7 @@
 import json
+from typing import List
 
-from .schemas import MetadataType, EntityMetadata
+from .schemas import MetadataType, EntityMetadata, EntityIndexItem, MetadataIndexItem
 
 
 def convert_metadata_value(metadata: EntityMetadata):
@@ -16,31 +17,28 @@ def convert_metadata_value(metadata: EntityMetadata):
 
 
 def upsert(client, entity):
-    # Prepare the entity data for Typesense
-    entity_data = {
-        "id": str(entity.id),
-        "filepath": entity.filepath,
-        "filename": entity.filename,
-        "size": entity.size,
-        "file_created_at": int(entity.file_created_at.timestamp()),
-        "file_last_modified_at": int(entity.file_last_modified_at.timestamp()),
-        "file_type": entity.file_type,
-        "file_type_group": entity.file_type_group,
-        "last_scan_at": (
-            int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None
-        ),
-        "library_id": entity.library_id,
-        "folder_id": entity.folder_id,
-        "tags": [tag.name for tag in entity.tags],
-        "metadata_entries": [
-            {
-                "key": metadata.key,
-                "value": convert_metadata_value(metadata),
-                "source": metadata.source,
-            }
+    entity_data = EntityIndexItem(
+        id=str(entity.id),
+        filepath=entity.filepath,
+        filename=entity.filename,
+        size=entity.size,
+        file_created_at=int(entity.file_created_at.timestamp()),
+        file_last_modified_at=int(entity.file_last_modified_at.timestamp()),
+        file_type=entity.file_type,
+        file_type_group=entity.file_type_group,
+        last_scan_at=int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None,
+        library_id=entity.library_id,
+        folder_id=entity.folder_id,
+        tags=[tag.name for tag in entity.tags],
+        metadata_entries=[
+            MetadataIndexItem(
+                key=metadata.key,
+                value=convert_metadata_value(metadata),
+                source=metadata.source,
+            )
             for metadata in entity.metadata_entries
         ],
-        "metadata_text": "\n\n".join(
+        metadata_text="\n\n".join(
             [
                 (
                     f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
@@ -50,12 +48,61 @@ def upsert(client, entity):
                 for metadata in entity.metadata_entries
             ]
         ),
-    }
+    )
 
     # Sync the entity data to Typesense
     try:
-        client.collections["entities"].documents.upsert(entity_data)
+        client.collections["entities"].documents.upsert(entity_data.model_dump_json())
     except Exception as e:
         raise Exception(
             f"Failed to sync entity to Typesense: {str(e)}",
         )
+
+
+def remove_entity_by_id(client, entity_id):
+    try:
+        client.collections["entities"].documents[entity_id].delete()
+    except Exception as e:
+        raise Exception(
+            f"Failed to remove entity from Typesense: {str(e)}",
+        )
+
+
+def list_all_entities(client, library_id: int, folder_id: int, limit=100, offset=0) -> List[EntityIndexItem]:
+    try:
+        response = client.collections["entities"].documents.search(
+            {
+                "q": "*",
+                "filter_by": f"library_id:={library_id} && folder_id:={folder_id}",
+                "per_page": limit,
+                "page": offset // limit + 1,
+            }
+        )
+        return [
+            EntityIndexItem(
+                id=hit["document"]["id"],
+                filepath=hit["document"]["filepath"],
+                filename=hit["document"]["filename"],
+                size=hit["document"]["size"],
+                file_created_at=hit["document"]["file_created_at"],
+                file_last_modified_at=hit["document"]["file_last_modified_at"],
+                file_type=hit["document"]["file_type"],
+                file_type_group=hit["document"]["file_type_group"],
+                last_scan_at=hit["document"].get("last_scan_at"),
+                library_id=hit["document"]["library_id"],
+                folder_id=hit["document"]["folder_id"],
+                tags=hit["document"]["tags"],
+                metadata_entries=[
+                    MetadataIndexItem(
+                        key=entry["key"],
+                        value=entry["value"],
+                        source=entry["source"]
+                    ) for entry in hit["document"]["metadata_entries"]
+                ],
+                metadata_text=hit["document"]["metadata_text"]
+            ) for hit in response["hits"]
+        ]
+    except Exception as e:
+        raise Exception(
+            f"Failed to list entities for library {library_id} and folder {folder_id}: {str(e)}",
+        )
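
One detail worth noting in list_all_entities: the HTTP layer speaks limit/offset, while Typesense's search API pages with 1-based page/per_page, so offset // limit + 1 does the translation. The mapping is exact only when offset is a multiple of limit, which the CLI guarantees by stepping offset in increments equal to its limit of 200. A quick check:

# Quick check of the offset -> page translation used by list_all_entities.
for limit, offset in [(200, 0), (200, 200), (200, 400)]:
    print(f"limit={limit} offset={offset} -> page={offset // limit + 1}")
# limit=200 offset=0   -> page=1
# limit=200 offset=200 -> page=2
# limit=200 offset=400 -> page=3
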
@@ -1,5 +1,5 @@
-from pydantic import BaseModel, ConfigDict, DirectoryPath, HttpUrl
-from typing import List
+from pydantic import BaseModel, ConfigDict, DirectoryPath, HttpUrl, Field
+from typing import List, Optional, Any
 from datetime import datetime
 from enum import Enum
 
@@ -140,3 +140,25 @@ class Entity(BaseModel):
 
     model_config = ConfigDict(from_attributes=True)
 
+
+class MetadataIndexItem(BaseModel):
+    key: str
+    value: Any
+    source: str
+
+
+class EntityIndexItem(BaseModel):
+    id: str
+    filepath: str
+    filename: str
+    size: int
+    file_created_at: int = Field(..., description="Unix timestamp")
+    file_last_modified_at: int = Field(..., description="Unix timestamp")
+    file_type: str
+    file_type_group: str
+    last_scan_at: Optional[int] = Field(None, description="Unix timestamp")
+    library_id: int
+    folder_id: int
+    tags: List[str]
+    metadata_entries: List[MetadataIndexItem]
+    metadata_text: str
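
A hedged construction example for the two new index models; the field values are illustrative and the import path is an assumption (this diff view does not show the module's filename, only that it is the project's schemas module):

# from .schemas import EntityIndexItem, MetadataIndexItem  # import path assumed

item = EntityIndexItem(
    id="42",
    filepath="/data/example.txt",       # illustrative values, not from the repo
    filename="example.txt",
    size=1024,
    file_created_at=1700000000,         # Unix timestamps, per the Field hints
    file_last_modified_at=1700000000,
    file_type="txt",
    file_type_group="document",
    last_scan_at=None,                  # Optional[int]
    library_id=1,
    folder_id=1,
    tags=["notes"],
    metadata_entries=[MetadataIndexItem(key="lang", value="en", source="detector")],
    metadata_text="key: lang\nvalue: en",
)
print(item.model_dump_json())           # the payload upsert() now hands to Typesense
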
@@ -28,6 +28,8 @@ from .schemas import (
     UpdateEntityTagsParam,
     UpdateEntityMetadataParam,
     MetadataType,
+    EntityIndexItem,
+    MetadataIndexItem,
 )
 
 engine = create_engine(f"sqlite:///{get_database_path()}")
@@ -280,6 +282,55 @@ async def sync_entity_to_typesense(entity_id: int, db: Session = Depends(get_db)
     return None
 
 
+@app.delete(
+    "/entities/{entity_id}/index",
+    status_code=status.HTTP_204_NO_CONTENT,
+    tags=["entity"],
+)
+async def remove_entity_from_typesense(entity_id: int, db: Session = Depends(get_db)):
+    entity = crud.get_entity_by_id(entity_id, db)
+    if entity is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Entity not found",
+        )
+
+    try:
+        indexing.remove_entity_by_id(client, entity_id)
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=str(e),
+        )
+    return None
+
+
+@app.get(
+    "/libraries/{library_id}/folders/{folder_id}/index",
+    response_model=List[EntityIndexItem],
+    tags=["entity"],
+)
+def list_entities_in_folder(
+    library_id: int,
+    folder_id: int,
+    limit: Annotated[int, Query(ge=1, le=200)] = 10,
+    offset: int = 0,
+    db: Session = Depends(get_db),
+):
+    library = crud.get_library_by_id(library_id, db)
+    if library is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND, detail="Library not found"
+        )
+
+    if folder_id not in [folder.id for folder in library.folders]:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Folder not found in the specified library",
+        )
+
+    return indexing.list_all_entities(client, library_id, folder_id, limit, offset)
+
+
 @app.patch("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"])
 @app.put("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"])
 def patch_entity_tags(
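
Taken together, the commit adds one Typer subcommand and two FastAPI endpoints (DELETE /entities/{id}/index and GET /libraries/{id}/folders/{id}/index) on top of the pre-existing POST /entities/{id}/index. A hedged sketch of exercising them directly with httpx, assuming a server running at the same BASE_URL the CLI uses (the IDs are illustrative):

import httpx

BASE_URL = "http://localhost:8080"   # same default as the CLI file

# Index one entity (pre-existing POST endpoint the new CLI command calls).
resp = httpx.post(f"{BASE_URL}/entities/42/index")          # 204 on success

# New in this commit: list indexed entities in a folder, paginated.
hits = httpx.get(
    f"{BASE_URL}/libraries/1/folders/1/index",
    params={"limit": 200, "offset": 0},                     # limit is capped at 200
).json()

# New in this commit: drop an entity's index entry.
resp = httpx.delete(f"{BASE_URL}/entities/42/index")        # 204 on success
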