refactor: remove typesense-related code

arkohut 2024-11-06 00:19:31 +08:00
parent 53d2549344
commit a56cc80e14
14 changed files with 16 additions and 1185 deletions

View File

@@ -683,149 +683,6 @@ async def index_batch(client, entity_ids):
return index_response
@lib_app.command("typesense-index")
def typesense_index(
library_id: int,
folders: List[int] = typer.Option(None, "--folder", "-f"),
force: bool = typer.Option(False, "--force", help="Force update all indexes"),
batchsize: int = typer.Option(
4, "--batchsize", "-bs", help="Number of entities to index in a batch"
),
):
print(f"Indexing library {library_id}")
# Get the library
response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
if response.status_code != 200:
print(f"Failed to get library: {response.status_code} - {response.text}")
return
library = response.json()
scanned_entities = set()
# Filter folders if the folders parameter is provided
if folders:
library_folders = [
folder for folder in library["folders"] if folder["id"] in folders
]
else:
library_folders = library["folders"]
async def process_folders():
async with httpx.AsyncClient(timeout=60) as client:
# Iterate through folders
for folder in library_folders:
tqdm.write(f"Processing folder: {folder['id']}")
# List all entities in the folder
limit = 200
offset = 0
total_entities = 0 # We'll update this after the first request
with tqdm(
total=total_entities, desc="Indexing entities", leave=True
) as pbar:
while True:
entities_response = await client.get(
f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
params={"limit": limit, "offset": offset},
)
if entities_response.status_code != 200:
pbar.write(
f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
)
break
entities = entities_response.json()
if not entities:
break
# Update total if this is the first request
if offset == 0:
total_entities = int(
entities_response.headers.get(
"X-Total-Count", total_entities
)
)
pbar.total = total_entities
pbar.refresh()
# Index each entity
for i in range(0, len(entities), batchsize):
batch = entities[i : i + batchsize]
to_index = []
for entity in batch:
needs_indexing = force or await check_and_index_entity(
client, entity["id"], entity["last_scan_at"]
)
if needs_indexing:
to_index.append(entity["id"])
if to_index:
index_response = await index_batch(client, to_index)
if index_response.status_code == 204:
pbar.write(
f"Indexed batch of {len(to_index)} entities"
)
else:
pbar.write(
f"Failed to index batch: {index_response.status_code} - {index_response.text}"
)
scanned_entities.update(
str(entity["id"]) for entity in batch
)
pbar.update(len(batch))
offset += limit
# List all indexed entities in the folder
offset = 0
print(f"Starting cleanup process for folder {folder['id']}")
while True:
index_response = await client.get(
f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/index",
params={"limit": 200, "offset": offset},
)
if index_response.status_code != 200:
tqdm.write(
f"Failed to get indexed entities: {index_response.status_code} - {index_response.text}"
)
break
indexed_entities = index_response.json()
if not indexed_entities:
print("No more indexed entities to process")
break
# Delete indexes for entities not in scanned_entities
for indexed_entity in tqdm(
indexed_entities, desc="Cleaning up indexes", leave=False
):
if indexed_entity["id"] not in scanned_entities:
tqdm.write(
f"Entity {indexed_entity['id']} not in scanned entities, deleting index"
)
delete_response = await client.delete(
f"{BASE_URL}/entities/{indexed_entity['id']}/index"
)
if delete_response.status_code == 204:
tqdm.write(
f"Deleted index for entity: {indexed_entity['id']}"
)
else:
tqdm.write(
f"Failed to delete index for entity {indexed_entity['id']}: {delete_response.status_code} - {delete_response.text}"
)
offset += 200
print(f"Finished cleanup process for folder {folder['id']}")
asyncio.run(process_folders())
print("Indexing completed")
@lib_app.command("sync")
def sync(
library_id: int,
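
For reference, the command removed above combines offset/limit pagination with a tqdm progress bar sized from the X-Total-Count response header. A minimal standalone sketch of that pattern (endpoint shape and header name as in the removed code; the BASE_URL value is an assumption, adjust to the running server):

import httpx
from tqdm import tqdm

BASE_URL = "http://localhost:8080"  # assumed server address

async def iter_entities(library_id: int, folder_id: int, limit: int = 200):
    """Yield entities page by page, sizing the progress bar from X-Total-Count."""
    offset = 0
    async with httpx.AsyncClient(timeout=60) as client:
        with tqdm(desc="Indexing entities", leave=True) as pbar:
            while True:
                resp = await client.get(
                    f"{BASE_URL}/libraries/{library_id}/folders/{folder_id}/entities",
                    params={"limit": limit, "offset": offset},
                )
                resp.raise_for_status()
                entities = resp.json()
                if not entities:
                    break
                if offset == 0:  # the first page carries the total count
                    pbar.total = int(resp.headers.get("X-Total-Count", 0))
                    pbar.refresh()
                for entity in entities:
                    yield entity
                pbar.update(len(entities))
                offset += limit

# usage: async for entity in iter_entities(library_id=1, folder_id=1): ...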

View File

@@ -8,7 +8,6 @@ import httpx
import typer
from .config import settings, display_config
from .models import init_database
from .initialize_typesense import init_typesense
from .record import (
run_screen_recorder_once,
run_screen_recorder,
@@ -19,7 +18,7 @@ import sys
import subprocess
import platform
from .cmds.plugin import plugin_app, bind
from .cmds.library import lib_app, scan, typesense_index, reindex, watch
from .cmds.library import lib_app, scan, reindex, watch
import psutil
import signal
from tabulate import tabulate
@@ -54,7 +53,6 @@ def callback(ctx: typer.Context):
# List of commands that require the server to be running
server_dependent_commands = [
"scan",
"typesense-index",
"reindex",
"watch",
@@ -82,12 +80,8 @@ app.add_typer(lib_app, name="lib", callback=callback)
def serve():
"""Run the server after initializing if necessary."""
db_success = init_database()
ts_success = True
if settings.typesense.enabled:
ts_success = init_typesense()
if db_success and (ts_success or not settings.typesense.enabled):
if db_success:
from .server import run_server
run_server()
else:
print("Server initialization failed. Unable to start the server.")
@@ -95,12 +89,9 @@ def serve():
@app.command()
def init():
"""Initialize the database and Typesense collection if enabled."""
"""Initialize the database."""
db_success = init_database()
ts_success = True
if settings.typesense.enabled:
ts_success = init_typesense()
if db_success and (ts_success or not settings.typesense.enabled):
if db_success:
print("Initialization completed successfully.")
else:
print("Initialization failed. Please check the error messages above.")
@@ -180,36 +171,6 @@ def scan_default_library(
scan(default_library["id"], path=path, plugins=plugins, folders=folders, force=force)
@app.command("typesense-index")
def typsense_index_default_library(
batchsize: int = typer.Option(
4, "--batchsize", "-bs", help="Number of entities to index in a batch"
),
force: bool = typer.Option(False, "--force", help="Force update all indexes"),
):
"""
Index the default library for memos.
"""
# Get the default library
response = httpx.get(f"{BASE_URL}/libraries")
if response.status_code != 200:
print(f"Failed to retrieve libraries: {response.status_code} - {response.text}")
return
libraries = response.json()
default_library = next(
(lib for lib in libraries if lib["name"] == settings.default_library), None
)
if not default_library:
print("Default library does not exist.")
return
typesense_index(
default_library["id"], force=force, folders=None, batchsize=batchsize
)
@app.command("reindex")
def reindex_default_library(
force: bool = typer.Option(

View File

@@ -45,17 +45,6 @@ class EmbeddingSettings(BaseModel):
use_local: bool = True
class TypesenseSettings(BaseModel):
# is disabled by default, and right now is quite unnecessary
enabled: bool = False
host: str = "localhost"
port: str = "8108"
protocol: str = "http"
api_key: str = "xyz"
connection_timeout_seconds: int = 10
collection_name: str = "entities"
class Settings(BaseSettings):
model_config = SettingsConfigDict(
yaml_file=str(Path.home() / ".memos" / "config.yaml"),
@@ -81,9 +70,6 @@ class Settings(BaseSettings):
# Embedding settings
embedding: EmbeddingSettings = EmbeddingSettings()
# Typesense settings
typesense: TypesenseSettings = TypesenseSettings()
batchsize: int = 1
auth_username: str = "admin"
@@ -162,9 +148,6 @@ settings = Settings()
# Define the default database path
os.makedirs(settings.resolved_base_dir, exist_ok=True)
# Global variable for Typesense collection name
TYPESENSE_COLLECTION_NAME = settings.typesense.collection_name
# Function to get the database path from environment variable or default
def get_database_path():
@@ -172,9 +155,7 @@ def get_database_path():
def format_value(value):
if isinstance(
value, (VLMSettings, OCRSettings, EmbeddingSettings, TypesenseSettings)
):
if isinstance(value, (VLMSettings, OCRSettings, EmbeddingSettings)):
return (
"{\n"
+ "\n".join(f" {k}: {v}" for k, v in value.model_dump().items())

View File

@@ -48,13 +48,3 @@ embedding:
# num_dim: 1536
# use_local: false
# use_modelscope: false
typesense:
enabled: false
api_key: xyz
collection_name: entities
connection_timeout_seconds: 10
host: localhost
port: '8108'
protocol: http
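
The deleted typesense block above was parsed into TypesenseSettings by pydantic-settings, which reads ~/.memos/config.yaml through the yaml_file entry in SettingsConfigDict (see the Settings class earlier in this diff). A minimal sketch of that nested-YAML-to-model wiring, using a hypothetical block name for illustration:

from pathlib import Path
from pydantic import BaseModel
from pydantic_settings import (
    BaseSettings,
    SettingsConfigDict,
    YamlConfigSettingsSource,
)

class BackendSettings(BaseModel):  # hypothetical block, shaped like TypesenseSettings
    enabled: bool = False
    host: str = "localhost"
    port: str = "8108"

class Settings(BaseSettings):
    model_config = SettingsConfigDict(yaml_file=str(Path.home() / ".memos" / "config.yaml"))
    backend: BackendSettings = BackendSettings()

    @classmethod
    def settings_customise_sources(
        cls, settings_cls, init_settings, env_settings, dotenv_settings, file_secret_settings
    ):
        # Put the YAML file declared in model_config into the source chain.
        return (init_settings, YamlConfigSettingsSource(settings_cls), env_settings)

settings = Settings()  # a `backend:` mapping in the YAML fills BackendSettings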

View File

@@ -1,394 +0,0 @@
import json
import httpx
from typing import List
from datetime import datetime
from .schemas import (
MetadataType,
EntityMetadata,
EntityIndexItem,
MetadataIndexItem,
EntitySearchResult,
SearchResult,
Facet,
SearchHit,
TextMatchInfo,
HybridSearchInfo,
RequestParams,
)
from .config import settings, TYPESENSE_COLLECTION_NAME
from .embedding import get_embeddings
def convert_metadata_value(metadata: EntityMetadata):
if metadata.data_type == MetadataType.JSON_DATA:
return json.loads(metadata.value)
else:
return metadata.value
def parse_date_fields(entity):
timestamp_metadata = next(
(m for m in entity.metadata_entries if m.key == "timestamp"), None
)
if timestamp_metadata and len(timestamp_metadata.value) == 15:
try:
dt = datetime.strptime(timestamp_metadata.value, "%Y%m%d-%H%M%S")
except ValueError:
dt = entity.file_created_at
else:
dt = entity.file_created_at
return {
"created_date": dt.strftime("%Y-%m-%d"),
"created_month": dt.strftime("%Y-%m"),
"created_year": dt.strftime("%Y"),
}
def generate_metadata_text(metadata_entries):
# OCR results are not used for now
def process_ocr_result(metadata):
try:
ocr_data = json.loads(metadata.value)
if isinstance(ocr_data, list) and all(
isinstance(item, dict)
and "dt_boxes" in item
and "rec_txt" in item
and "score" in item
for item in ocr_data
):
return " ".join(item["rec_txt"] for item in ocr_data)
else:
return json.dumps(ocr_data, indent=2)
except json.JSONDecodeError:
return metadata.value
non_ocr_metadata = [
(
f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}"
if metadata.data_type == MetadataType.JSON_DATA
else f"key: {metadata.key}\nvalue:\n{metadata.value}"
)
for metadata in metadata_entries
if metadata.key != "ocr_result"
]
metadata_text = "\n\n".join(non_ocr_metadata)
return metadata_text
async def bulk_upsert(client, entities):
documents = []
metadata_texts = []
entities_with_metadata = []
for entity in entities:
metadata_text = generate_metadata_text(entity.metadata_entries)
print(f"metadata_text: {len(metadata_text)}")
if metadata_text:
metadata_texts.append(metadata_text)
entities_with_metadata.append(entity)
documents.append(
EntityIndexItem(
id=str(entity.id),
filepath=entity.filepath,
filename=entity.filename,
size=entity.size,
file_created_at=int(entity.file_created_at.timestamp()),
file_last_modified_at=int(entity.file_last_modified_at.timestamp()),
file_type=entity.file_type,
file_type_group=entity.file_type_group,
last_scan_at=(
int(entity.last_scan_at.timestamp())
if entity.last_scan_at
else None
),
library_id=entity.library_id,
folder_id=entity.folder_id,
tags=[tag.name for tag in entity.tags],
metadata_entries=[
MetadataIndexItem(
key=metadata.key,
value=convert_metadata_value(metadata),
source=metadata.source,
)
for metadata in entity.metadata_entries
],
metadata_text=metadata_text,
**parse_date_fields(entity),
).model_dump(mode="json")
)
embeddings = await get_embeddings(metadata_texts)
for doc, embedding, entity in zip(documents, embeddings, entities):
if entity in entities_with_metadata:
doc["embedding"] = embedding
# Sync the entity data to Typesense
try:
response = client.collections[TYPESENSE_COLLECTION_NAME].documents.import_(
documents, {"action": "upsert"}
)
return response
except Exception as e:
raise Exception(
f"Failed to sync entities to Typesense: {str(e)}",
)
async def upsert(client, entity):
date_fields = parse_date_fields(entity)
metadata_text = generate_metadata_text(entity.metadata_entries)
embedding = (await get_embeddings([metadata_text]))[0]
entity_data = EntityIndexItem(
id=str(entity.id),
filepath=entity.filepath,
filename=entity.filename,
size=entity.size,
file_created_at=int(entity.file_created_at.timestamp()),
file_last_modified_at=int(entity.file_last_modified_at.timestamp()),
file_type=entity.file_type,
file_type_group=entity.file_type_group,
last_scan_at=(
int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None
),
library_id=entity.library_id,
folder_id=entity.folder_id,
tags=[tag.name for tag in entity.tags],
metadata_entries=[
MetadataIndexItem(
key=metadata.key,
value=convert_metadata_value(metadata),
source=metadata.source,
)
for metadata in entity.metadata_entries
],
metadata_text=metadata_text,
embedding=embedding,
created_date=date_fields.get("created_date"),
created_month=date_fields.get("created_month"),
created_year=date_fields.get("created_year"),
)
# Sync the entity data to Typesense
try:
client.collections[TYPESENSE_COLLECTION_NAME].documents.upsert(
entity_data.model_dump_json()
)
except Exception as e:
raise Exception(
f"Failed to sync entity to Typesense: {str(e)}",
)
def remove_entity_by_id(client, entity_id):
try:
client.collections[TYPESENSE_COLLECTION_NAME].documents[entity_id].delete()
except Exception as e:
raise Exception(
f"Failed to remove entity from Typesense: {str(e)}",
)
def list_all_entities(
client, library_id: int, folder_id: int, limit=100, offset=0
) -> List[EntityIndexItem]:
try:
response = client.collections[TYPESENSE_COLLECTION_NAME].documents.search(
{
"q": "*",
"filter_by": f"library_id:={library_id} && folder_id:={folder_id}",
"per_page": limit,
"page": offset // limit + 1,
}
)
return [
EntityIndexItem(
id=hit["document"]["id"],
filepath=hit["document"]["filepath"],
filename=hit["document"]["filename"],
size=hit["document"]["size"],
file_created_at=hit["document"]["file_created_at"],
file_last_modified_at=hit["document"]["file_last_modified_at"],
file_type=hit["document"]["file_type"],
file_type_group=hit["document"]["file_type_group"],
last_scan_at=hit["document"].get("last_scan_at"),
library_id=hit["document"]["library_id"],
folder_id=hit["document"]["folder_id"],
tags=hit["document"]["tags"],
metadata_entries=[
MetadataIndexItem(
key=entry["key"], value=entry["value"], source=entry["source"]
)
for entry in hit["document"]["metadata_entries"]
],
metadata_text=hit["document"]["metadata_text"],
created_date=hit["document"].get("created_date"),
created_month=hit["document"].get("created_month"),
created_year=hit["document"].get("created_year"),
)
for hit in response["hits"]
]
except Exception as e:
raise Exception(
f"Failed to list entities for library {library_id} and folder {folder_id}: {str(e)}",
)
async def search_entities(
client,
q: str,
library_ids: List[int] = None,
folder_ids: List[int] = None,
tags: List[str] = None,
created_dates: List[str] = None,
limit: int = 48,
offset: int = 0,
start: int = None,
end: int = None,
) -> SearchResult:
try:
filter_by = []
if library_ids:
filter_by.append(f"library_id:[{','.join(map(str, library_ids))}]")
if folder_ids:
filter_by.append(f"folder_id:[{','.join(map(str, folder_ids))}]")
if start is not None and end is not None:
filter_by.append(f"file_created_at:={start}..{end}")
if tags:
filter_by.append(f"tags:=[{','.join(tags)}]")
if created_dates:
filter_by.append(f"created_date:[{','.join(created_dates)}]")
filter_by_str = " && ".join(filter_by) if filter_by else ""
# Convert q to embedding using get_embeddings and take the first embedding
embedding = (await get_embeddings([q]))[0]
common_search_params = {
"collection": TYPESENSE_COLLECTION_NAME,
}
search_parameters = {
"q": q,
"query_by": "tags,filename,filepath,metadata_text",
"infix": "off,always,always,off",
"prefix": "true,true,true,false",
"filter_by": (
f"{filter_by_str} && file_type_group:=image"
if filter_by_str
else "file_type_group:=image"
),
"limit": limit,
"offset": offset,
"exclude_fields": "metadata_text,embedding",
"sort_by": "_text_match:desc,file_created_at:desc",
"facet_by": "created_date,created_month,created_year,tags",
"vector_query": f"embedding:({embedding}, k:{limit})",
}
search_parameters_to_print = search_parameters.copy()
search_parameters_to_print["vector_query"] = f"embedding:([...], k:{limit})"
print(json.dumps(search_parameters_to_print, indent=2))
search_response = client.multi_search.perform(
{"searches": [search_parameters]}, common_search_params
)
search_results = search_response["results"][0]
hits = [
SearchHit(
document=EntitySearchResult(
id=hit["document"]["id"],
filepath=hit["document"]["filepath"],
filename=hit["document"]["filename"],
size=hit["document"]["size"],
file_created_at=hit["document"]["file_created_at"],
file_last_modified_at=hit["document"]["file_last_modified_at"],
file_type=hit["document"]["file_type"],
file_type_group=hit["document"]["file_type_group"],
last_scan_at=hit["document"].get("last_scan_at"),
library_id=hit["document"]["library_id"],
folder_id=hit["document"]["folder_id"],
tags=hit["document"]["tags"],
metadata_entries=[
MetadataIndexItem(
key=entry["key"],
value=entry["value"],
source=entry["source"],
)
for entry in hit["document"]["metadata_entries"]
],
created_date=hit["document"].get("created_date"),
created_month=hit["document"].get("created_month"),
created_year=hit["document"].get("created_year"),
),
highlight=hit.get("highlight", {}),
highlights=hit.get("highlights", []),
hybrid_search_info=(
HybridSearchInfo(**hit["hybrid_search_info"])
if hit.get("hybrid_search_info")
else None
),
text_match=hit.get("text_match"),
text_match_info=(
TextMatchInfo(**hit["text_match_info"])
if hit.get("text_match_info")
else None
),
)
for hit in search_results["hits"]
]
return SearchResult(
facet_counts=[Facet(**facet) for facet in search_results["facet_counts"]],
found=search_results["found"],
hits=hits,
out_of=search_results["out_of"],
page=search_results["page"],
request_params=RequestParams(**search_results["request_params"]),
search_cutoff=search_results["search_cutoff"],
search_time_ms=search_results["search_time_ms"],
)
except Exception as e:
raise Exception(
f"Failed to search entities: {str(e)}",
)
def fetch_entity_by_id(client, id: str) -> EntityIndexItem:
try:
document = (
client.collections[TYPESENSE_COLLECTION_NAME].documents[id].retrieve()
)
return EntitySearchResult(
id=document["id"],
filepath=document["filepath"],
filename=document["filename"],
size=document["size"],
file_created_at=document["file_created_at"],
file_last_modified_at=document["file_last_modified_at"],
file_type=document["file_type"],
file_type_group=document["file_type_group"],
last_scan_at=document.get("last_scan_at"),
library_id=document["library_id"],
folder_id=document["folder_id"],
tags=document["tags"],
metadata_entries=[
MetadataIndexItem(
key=entry["key"], value=entry["value"], source=entry["source"]
)
for entry in document["metadata_entries"]
],
created_date=document.get("created_date"),
created_month=document.get("created_month"),
created_year=document.get("created_year"),
)
except Exception as e:
raise Exception(
f"Failed to fetch document by id: {str(e)}",
)
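
The removed search_entities pairs keyword matching (query_by) with a vector_query over the embedding field, which Typesense executes as a hybrid search. A stripped-down sketch of that call shape with typesense-python (connection values taken from the deleted config defaults; the query embedding here is a stand-in for get_embeddings([q])[0]):

import typesense

client = typesense.Client(
    {
        "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
        "api_key": "xyz",
        "connection_timeout_seconds": 10,
    }
)

embedding = [0.0] * 768  # stand-in query vector; dimension must match the schema's num_dim

response = client.multi_search.perform(
    {
        "searches": [
            {
                "q": "receipt",
                "query_by": "tags,filename,filepath,metadata_text",
                "filter_by": "file_type_group:=image",
                "vector_query": f"embedding:({embedding}, k:48)",
                "exclude_fields": "metadata_text,embedding",
            }
        ]
    },
    {"collection": "entities"},
)
print(response["results"][0]["found"])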

View File

@@ -1,170 +0,0 @@
import typesense
from .config import settings, TYPESENSE_COLLECTION_NAME
import sys
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Define the schema for the Typesense collection
schema = {
"name": TYPESENSE_COLLECTION_NAME,
"enable_nested_fields": True,
"fields": [
{"name": "filepath", "type": "string", "infix": True},
{"name": "filename", "type": "string", "infix": True},
{"name": "size", "type": "int32"},
{"name": "file_created_at", "type": "int64", "facet": False},
{
"name": "created_date",
"type": "string",
"facet": True,
"optional": True,
"sort": True,
},
{
"name": "created_month",
"type": "string",
"facet": True,
"optional": True,
"sort": True,
},
{
"name": "created_year",
"type": "string",
"facet": True,
"optional": True,
"sort": True,
},
{"name": "file_last_modified_at", "type": "int64", "facet": False},
{"name": "file_type", "type": "string", "facet": True},
{"name": "file_type_group", "type": "string", "facet": True},
{"name": "last_scan_at", "type": "int64", "facet": False, "optional": True},
{"name": "library_id", "type": "int32", "facet": True},
{"name": "folder_id", "type": "int32", "facet": True},
{
"name": "tags",
"type": "string[]",
"facet": True,
"optional": True,
"locale": "zh",
},
{
"name": "metadata_entries",
"type": "object[]",
"optional": True,
"locale": "zh",
},
{"name": "metadata_text", "type": "string", "optional": True, "locale": "zh"},
{
"name": "embedding",
"type": "float[]",
"num_dim": settings.embedding.num_dim,
"optional": True,
},
{
"name": "image_embedding",
"type": "float[]",
"optional": True,
},
],
"token_separators": [":", "/", " ", "\\"],
}
def update_collection_fields(client, schema):
existing_collection = client.collections[TYPESENSE_COLLECTION_NAME].retrieve()
existing_fields = {field["name"]: field for field in existing_collection["fields"]}
new_fields = {field["name"]: field for field in schema["fields"]}
fields_to_add = []
for name, field in new_fields.items():
if name not in existing_fields:
fields_to_add.append(field)
else:
# Check if the field can be updated
updatable_properties = ["facet", "optional"]
for prop in updatable_properties:
if prop in field and field[prop] != existing_fields[name].get(prop):
fields_to_add.append(field)
break
if fields_to_add:
client.collections[TYPESENSE_COLLECTION_NAME].update({"fields": fields_to_add})
print(
f"Added/updated {len(fields_to_add)} fields in the '{TYPESENSE_COLLECTION_NAME}' collection."
)
else:
print(
f"No new fields to add or update in the '{TYPESENSE_COLLECTION_NAME}' collection."
)
def init_typesense():
"""Initialize the Typesense collection."""
if not settings.typesense.enabled:
logger.warning("Typesense is not enabled. Skipping initialization.")
return False
try:
client = typesense.Client(
{
"nodes": [
{
"host": settings.typesense_host,
"port": settings.typesense_port,
"protocol": settings.typesense_protocol,
}
],
"api_key": settings.typesense_api_key,
"connection_timeout_seconds": settings.typesense_connection_timeout_seconds,
}
)
existing_collections = client.collections.retrieve()
collection_names = [c["name"] for c in existing_collections]
if TYPESENSE_COLLECTION_NAME not in collection_names:
client.collections.create(schema)
logger.info(f"Typesense collection '{TYPESENSE_COLLECTION_NAME}' created successfully.")
else:
update_collection_fields(client, schema)
logger.info(f"Typesense collection '{TYPESENSE_COLLECTION_NAME}' already exists. Updated fields if necessary.")
return True
except Exception as e:
logger.error(f"Error initializing Typesense collection: {e}")
return False
if __name__ == "__main__":
import argparse
import sys
parser = argparse.ArgumentParser()
parser.add_argument("--force", action="store_true", help="Drop the collection before initializing")
args = parser.parse_args()
if not settings.typesense.enabled:
logger.warning("Typesense is not enabled. Please enable it in the configuration if you want to use Typesense.")
sys.exit(0)
client = typesense.Client(
{
"nodes": [
{
"host": settings.typesense_host,
"port": settings.typesense_port,
"protocol": settings.typesense_protocol,
}
],
"api_key": settings.typesense_api_key,
"connection_timeout_seconds": settings.typesense_connection_timeout_seconds,
}
)
if args.force:
try:
client.collections[TYPESENSE_COLLECTION_NAME].delete()
logger.info(f"Dropped collection '{TYPESENSE_COLLECTION_NAME}'.")
except Exception as e:
logger.error(f"Error dropping collection: {e}")
if not init_typesense():
sys.exit(1)
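
When run as a script with --force, the module above drops and recreates the collection. The same reinit flow can be reproduced directly against a client; a short sketch with a trimmed-down stand-in schema (the full schema is defined above):

import typesense

client = typesense.Client(
    {
        "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
        "api_key": "xyz",
        "connection_timeout_seconds": 10,
    }
)
schema = {"name": "entities", "fields": [{"name": "filename", "type": "string"}]}

try:
    client.collections["entities"].delete()  # drop any stale collection first
except Exception:
    pass  # fine if the collection did not exist yet
client.collections.create(schema)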

View File

@@ -195,27 +195,6 @@ class MetadataIndexItem(BaseModel):
source: str
class EntityIndexItem(BaseModel):
id: str
filepath: str
filename: str
size: int
file_created_at: int = Field(..., description="Unix timestamp")
created_date: Optional[str] = None
created_month: Optional[str] = None
created_year: Optional[str] = None
file_last_modified_at: int = Field(..., description="Unix timestamp")
file_type: str
file_type_group: str
last_scan_at: Optional[int] = Field(None, description="Unix timestamp")
library_id: int
folder_id: int
tags: List[str]
metadata_entries: List[MetadataIndexItem]
metadata_text: str
embedding: Optional[List[float]] = Field(None, description="Embedding vector")
class EntitySearchResult(BaseModel):
id: str
filepath: str

View File

@@ -20,12 +20,10 @@ from secrets import compare_digest
import functools
import logging
import typesense
from .config import get_database_path, settings
from memos.plugins.vlm import main as vlm_main
from memos.plugins.ocr import main as ocr_main
from . import crud, indexing
from . import crud
from .schemas import (
Library,
Folder,
@@ -40,7 +38,6 @@ from .schemas import (
UpdateEntityTagsParam,
UpdateEntityMetadataParam,
MetadataType,
EntityIndexItem,
MetadataIndexItem,
EntitySearchResult,
SearchResult,
@@ -62,23 +59,6 @@ engine = create_engine(f"sqlite:///{get_database_path()}")
event.listen(engine, "connect", load_extension)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Initialize Typesense client only if enabled
client = None
if settings.typesense.enabled:
client = typesense.Client(
{
"nodes": [
{
"host": settings.typesense.host,
"port": settings.typesense.port,
"protocol": settings.typesense.protocol,
}
],
"api_key": settings.typesense.api_key,
"connection_timeout_seconds": settings.typesense.connection_timeout_seconds,
}
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@@ -405,175 +385,6 @@ def update_entity_last_scan_at(entity_id: int, db: Session = Depends(get_db)):
)
def typesense_required(func):
@functools.wraps(func)
async def wrapper(*args, **kwargs):
if not settings.typesense.enabled:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Typesense is not enabled",
)
return await func(*args, **kwargs)
return wrapper
@app.post(
"/entities/{entity_id}/index",
status_code=status.HTTP_204_NO_CONTENT,
tags=["entity"],
)
@typesense_required
async def sync_entity_to_typesense(entity_id: int, db: Session = Depends(get_db)):
entity = crud.get_entity_by_id(entity_id, db)
if entity is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Entity not found",
)
try:
indexing.upsert(client, entity)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e),
)
return None
@app.post(
"/entities/batch-index",
status_code=status.HTTP_204_NO_CONTENT,
tags=["entity"],
)
@typesense_required
async def batch_sync_entities_to_typesense(
entity_ids: List[int], db: Session = Depends(get_db)
):
entities = crud.find_entities_by_ids(entity_ids, db)
if not entities:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="No entities found",
)
try:
await indexing.bulk_upsert(client, entities)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e),
)
return None
@app.get(
"/entities/{entity_id}/index",
response_model=EntitySearchResult,
tags=["entity"],
)
@typesense_required
async def get_entity_index(entity_id: int) -> EntityIndexItem:
try:
entity_index_item = indexing.fetch_entity_by_id(client, entity_id)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e),
)
return entity_index_item
@app.delete(
"/entities/{entity_id}/index",
status_code=status.HTTP_204_NO_CONTENT,
tags=["entity"],
)
@typesense_required
async def remove_entity_from_typesense(entity_id: int, db: Session = Depends(get_db)):
try:
indexing.remove_entity_by_id(client, entity_id)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e),
)
return None
@app.get(
"/libraries/{library_id}/folders/{folder_id}/index",
response_model=List[EntityIndexItem],
tags=["entity"],
)
@typesense_required
def list_entitiy_indices_in_folder(
library_id: int,
folder_id: int,
limit: Annotated[int, Query(ge=1, le=200)] = 10,
offset: int = 0,
db: Session = Depends(get_db),
):
library = crud.get_library_by_id(library_id, db)
if library is None:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND, detail="Library not found"
)
if folder_id not in [folder.id for folder in library.folders]:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Folder not found in the specified library",
)
return indexing.list_all_entities(client, library_id, folder_id, limit, offset)
@app.get("/search/v2", response_model=SearchResult, tags=["search"])
@typesense_required
async def search_entities(
q: str,
library_ids: str = Query(None, description="Comma-separated list of library IDs"),
folder_ids: str = Query(None, description="Comma-separated list of folder IDs"),
tags: str = Query(None, description="Comma-separated list of tags"),
created_dates: str = Query(
None, description="Comma-separated list of created dates in YYYY-MM-DD format"
),
limit: Annotated[int, Query(ge=1, le=200)] = 48,
offset: int = 0,
start: int = None,
end: int = None,
db: Session = Depends(get_db),
):
library_ids = [int(id) for id in library_ids.split(",")] if library_ids else None
folder_ids = [int(id) for id in folder_ids.split(",")] if folder_ids else None
tags = [tag.strip() for tag in tags.split(",")] if tags else None
created_dates = (
[date.strip() for date in created_dates.split(",")] if created_dates else None
)
try:
return await indexing.search_entities(
client,
q,
library_ids,
folder_ids,
tags,
created_dates,
limit,
offset,
start,
end,
)
except Exception as e:
print(f"Error searching entities: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e),
)
@app.put("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"])
def replace_entity_tags(
entity_id: int, update_tags: UpdateEntityTagsParam, db: Session = Depends(get_db)
@@ -925,28 +736,15 @@ def get_entity_context(
library_id=library_id,
entity_id=entity_id,
prev=prev_count,
next=next_count
next=next_count,
)
# Return the context object
return EntityContext(
prev=prev_entities,
next=next_entities
)
return EntityContext(prev=prev_entities, next=next_entities)
def run_server():
logging.info("Database path: %s", get_database_path())
if settings.typesense.enabled:
logging.info(
"Typesense connection info: Host: %s, Port: %s, Protocol: %s, Collection Name: %s",
settings.typesense.host,
settings.typesense.port,
settings.typesense.protocol,
settings.typesense.collection_name,
)
else:
logging.info("Typesense is disabled")
logging.info("VLM plugin enabled: %s", settings.vlm)
logging.info("OCR plugin enabled: %s", settings.ocr)

View File

@@ -24,7 +24,6 @@ dependencies = [
"typer",
"magika",
"pydantic-settings",
"typesense",
"opencv-python",
"pillow",
"piexif",

View File

@@ -1,10 +0,0 @@
fastapi
uvicorn
httpx
pydantic
sqlalchemy
typer
tabulate
magika
pydantic-settings
typesense

View File

@@ -1,12 +0,0 @@
@echo off
set TYPESENSE_API_KEY=xyz
if not exist "%CD%\typesense-data" mkdir "%CD%\typesense-data"
docker run -d -p 8108:8108 ^
-v "%CD%\typesense-data:/data" typesense/typesense:26.0 ^
--add-host=host.docker.internal:host-gateway ^
--data-dir /data ^
--api-key=%TYPESENSE_API_KEY% ^
--enable-cors

View File

@@ -1,11 +0,0 @@
export TYPESENSE_API_KEY=xyz
mkdir "$(pwd)"/typesense-data
docker run -d -p 8108:8108 \
--restart always \
-v"$(pwd)"/typesense-data:/data typesense/typesense:27.0 \
--add-host=host.docker.internal:host-gateway \
--data-dir /data \
--api-key=$TYPESENSE_API_KEY \
--enable-cors
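
Either script publishes Typesense on port 8108; once the container is up, the standard /health endpoint confirms it is reachable. A quick check using httpx, to match the rest of the codebase:

import httpx

resp = httpx.get("http://localhost:8108/health", timeout=5)
print(resp.json())  # a healthy node returns {"ok": true}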

web/package-lock.json (generated, 138 changed lines)
View File

@@ -20,8 +20,7 @@
"svelte-i18n": "^4.0.0",
"svelte-sonner": "^0.3.27",
"tailwind-merge": "^2.4.0",
"tailwind-variants": "^0.2.1",
"typesense": "^1.7.2"
"tailwind-variants": "^0.2.1"
},
"devDependencies": {
"@sveltejs/adapter-auto": "^2.1.1",
@@ -69,18 +68,6 @@
"node": ">=6.0.0"
}
},
"node_modules/@babel/runtime": {
"version": "7.24.7",
"resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.24.7.tgz",
"integrity": "sha512-UwgBRMjJP+xv857DCngvqXI3Iq6J4v0wXmwc6sapg+zyhbwmQX67LUEFrkK5tbyJ30jGuG3ZvWpBiB9LCy1kWw==",
"peer": true,
"dependencies": {
"regenerator-runtime": "^0.14.0"
},
"engines": {
"node": ">=6.9.0"
}
},
"node_modules/@cspotcode/source-map-support": {
"version": "0.8.1",
"resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
@@ -1711,11 +1698,6 @@
"node": ">=8"
}
},
"node_modules/asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
},
"node_modules/autoprefixer": {
"version": "10.4.19",
"resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.19.tgz",
@@ -1753,16 +1735,6 @@
"postcss": "^8.1.0"
}
},
"node_modules/axios": {
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.7.2.tgz",
"integrity": "sha512-2A8QhOMrbomlDuiLeK9XibIBzuHeRcqqNOHp0Cyp5EoJ1IFDh+XZH3A6BkXtv0K4gFGCI0Y4BM7B1wOEi0Rmgw==",
"dependencies": {
"follow-redirects": "^1.15.6",
"form-data": "^4.0.0",
"proxy-from-env": "^1.1.0"
}
},
"node_modules/axobject-query": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.0.0.tgz",
@@ -2039,17 +2011,6 @@
"simple-swizzle": "^0.2.2"
}
},
"node_modules/combined-stream": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"dependencies": {
"delayed-stream": "~1.0.0"
},
"engines": {
"node": ">= 0.8"
}
},
"node_modules/commander": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz",
@@ -2225,14 +2186,6 @@
"node": ">=0.10.0"
}
},
"node_modules/delayed-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/dequal": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz",
@@ -2831,25 +2784,6 @@
"tabbable": "^6.2.0"
}
},
"node_modules/follow-redirects": {
"version": "1.15.6",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz",
"integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==",
"funding": [
{
"type": "individual",
"url": "https://github.com/sponsors/RubenVerborgh"
}
],
"engines": {
"node": ">=4.0"
},
"peerDependenciesMeta": {
"debug": {
"optional": true
}
}
},
"node_modules/foreground-child": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.2.1.tgz",
@@ -2865,19 +2799,6 @@
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/form-data": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
"integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
"mime-types": "^2.1.12"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/fraction.js": {
"version": "4.3.7",
"resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz",
@@ -3353,18 +3274,6 @@
"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
},
"node_modules/loglevel": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/loglevel/-/loglevel-1.9.1.tgz",
"integrity": "sha512-hP3I3kCrDIMuRwAwHltphhDM1r8i55H33GgqjXbrisuJhF4kRhW1dNuxsRklp4bXl8DSdLaNLuiL4A/LWRfxvg==",
"engines": {
"node": ">= 0.6.0"
},
"funding": {
"type": "tidelift",
"url": "https://tidelift.com/funding/github/npm/loglevel"
}
},
"node_modules/lru-cache": {
"version": "10.3.0",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.3.0.tgz",
@@ -3458,25 +3367,6 @@
"node": ">=8.6"
}
},
"node_modules/mime-db": {
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"engines": {
"node": ">= 0.6"
}
},
"node_modules/mime-types": {
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"dependencies": {
"mime-db": "1.52.0"
},
"engines": {
"node": ">= 0.6"
}
},
"node_modules/min-indent": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/min-indent/-/min-indent-1.0.1.tgz",
@@ -4009,11 +3899,6 @@
"svelte": "^3.2.0 || ^4.0.0-next.0"
}
},
"node_modules/proxy-from-env": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg=="
},
"node_modules/punycode": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
@@ -4061,12 +3946,6 @@
"node": ">=8.10.0"
}
},
"node_modules/regenerator-runtime": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz",
"integrity": "sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==",
"peer": true
},
"node_modules/resolve": {
"version": "1.22.8",
"resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
@@ -5376,21 +5255,6 @@
"node": ">=14.17"
}
},
"node_modules/typesense": {
"version": "1.8.2",
"resolved": "https://registry.npmjs.org/typesense/-/typesense-1.8.2.tgz",
"integrity": "sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==",
"dependencies": {
"axios": "^1.6.0",
"loglevel": "^1.8.1"
},
"engines": {
"node": ">=18"
},
"peerDependencies": {
"@babel/runtime": "^7.23.2"
}
},
"node_modules/undici": {
"version": "5.28.4",
"resolved": "https://registry.npmjs.org/undici/-/undici-5.28.4.tgz",

View File

@@ -48,7 +48,6 @@
"svelte-i18n": "^4.0.0",
"svelte-sonner": "^0.3.27",
"tailwind-merge": "^2.4.0",
"tailwind-variants": "^0.2.1",
"typesense": "^1.7.2"
"tailwind-variants": "^0.2.1"
}
}