From a56cc80e14aa5bb4758e9eb5403d8ce7a1354544 Mon Sep 17 00:00:00 2001
From: arkohut <39525455+arkohut@users.noreply.github.com>
Date: Wed, 6 Nov 2024 00:19:31 +0800
Subject: [PATCH] refactor: remove typesense related code

---
 memos/cmds/library.py         | 143 ------------
 memos/commands.py             |  47 +---
 memos/config.py               |  23 +-
 memos/default_config.yaml     |  10 -
 memos/indexing.py             | 394 ----------------------------------
 memos/initialize_typesense.py | 170 ---------------
 memos/schemas.py              |  21 --
 memos/server.py               | 218 +------------------
 pyproject.toml                |   1 -
 requirements.txt              |  10 -
 search-engine/run-docker.bat  |  12 --
 search-engine/run-docker.sh   |  11 -
 web/package-lock.json         | 138 +-----------
 web/package.json              |   3 +-
 14 files changed, 16 insertions(+), 1185 deletions(-)
 delete mode 100644 memos/indexing.py
 delete mode 100644 memos/initialize_typesense.py
 delete mode 100644 requirements.txt
 delete mode 100644 search-engine/run-docker.bat
 delete mode 100644 search-engine/run-docker.sh

diff --git a/memos/cmds/library.py b/memos/cmds/library.py
index 6d35d32..3b2fcae 100644
--- a/memos/cmds/library.py
+++ b/memos/cmds/library.py
@@ -683,149 +683,6 @@ async def index_batch(client, entity_ids):
     return index_response
 
 
-@lib_app.command("typesense-index")
-def typesense_index(
-    library_id: int,
-    folders: List[int] = typer.Option(None, "--folder", "-f"),
-    force: bool = typer.Option(False, "--force", help="Force update all indexes"),
-    batchsize: int = typer.Option(
-        4, "--batchsize", "-bs", help="Number of entities to index in a batch"
-    ),
-):
-    print(f"Indexing library {library_id}")
-
-    # Get the library
-    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
-    if response.status_code != 200:
-        print(f"Failed to get library: {response.status_code} - {response.text}")
-        return
-
-    library = response.json()
-    scanned_entities = set()
-
-    # Filter folders if the folders parameter is provided
-    if folders:
-        library_folders = [
-            folder for folder in library["folders"] if folder["id"] in folders
-        ]
-    else:
-        library_folders = library["folders"]
-
-    async def process_folders():
-        async with httpx.AsyncClient(timeout=60) as client:
-            # Iterate through folders
-            for folder in library_folders:
-                tqdm.write(f"Processing folder: {folder['id']}")
-
-                # List all entities in the folder
-                limit = 200
-                offset = 0
-                total_entities = 0  # We'll update this after the first request
-                with tqdm(
-                    total=total_entities, desc="Indexing entities", leave=True
-                ) as pbar:
-                    while True:
-                        entities_response = await client.get(
-                            f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
-                            params={"limit": limit, "offset": offset},
-                        )
-                        if entities_response.status_code != 200:
-                            pbar.write(
-                                f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
-                            )
-                            break
-
-                        entities = entities_response.json()
-                        if not entities:
-                            break
-
-                        # Update total if this is the first request
-                        if offset == 0:
-                            total_entities = int(
-                                entities_response.headers.get(
-                                    "X-Total-Count", total_entities
-                                )
-                            )
-                            pbar.total = total_entities
-                            pbar.refresh()
-
-                        # Index each entity
-                        for i in range(0, len(entities), batchsize):
-                            batch = entities[i : i + batchsize]
-                            to_index = []
-
-                            for entity in batch:
-                                needs_indexing = force or await check_and_index_entity(
-                                    client, entity["id"], entity["last_scan_at"]
-                                )
-                                if needs_indexing:
-                                    to_index.append(entity["id"])
-
-                            if to_index:
-                                index_response = await index_batch(client, to_index)
-                                if index_response.status_code == 204:
-                                    pbar.write(
-                                        f"Indexed batch of 
{len(to_index)} entities" - ) - else: - pbar.write( - f"Failed to index batch: {index_response.status_code} - {index_response.text}" - ) - - scanned_entities.update( - str(entity["id"]) for entity in batch - ) - pbar.update(len(batch)) - - offset += limit - - # List all indexed entities in the folder - offset = 0 - print(f"Starting cleanup process for folder {folder['id']}") - while True: - index_response = await client.get( - f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/index", - params={"limit": 200, "offset": offset}, - ) - if index_response.status_code != 200: - tqdm.write( - f"Failed to get indexed entities: {index_response.status_code} - {index_response.text}" - ) - break - - indexed_entities = index_response.json() - if not indexed_entities: - print("No more indexed entities to process") - break - - # Delete indexes for entities not in scanned_entities - for indexed_entity in tqdm( - indexed_entities, desc="Cleaning up indexes", leave=False - ): - if indexed_entity["id"] not in scanned_entities: - tqdm.write( - f"Entity {indexed_entity['id']} not in scanned entities, deleting index" - ) - delete_response = await client.delete( - f"{BASE_URL}/entities/{indexed_entity['id']}/index" - ) - if delete_response.status_code == 204: - tqdm.write( - f"Deleted index for entity: {indexed_entity['id']}" - ) - else: - tqdm.write( - f"Failed to delete index for entity {indexed_entity['id']}: {delete_response.status_code} - {delete_response.text}" - ) - - offset += 200 - - print(f"Finished cleanup process for folder {folder['id']}") - - asyncio.run(process_folders()) - print("Indexing completed") - - @lib_app.command("sync") def sync( library_id: int, diff --git a/memos/commands.py b/memos/commands.py index 77a0c8b..74cdda2 100644 --- a/memos/commands.py +++ b/memos/commands.py @@ -8,7 +8,6 @@ import httpx import typer from .config import settings, display_config from .models import init_database -from .initialize_typesense import init_typesense from .record import ( run_screen_recorder_once, run_screen_recorder, @@ -19,7 +18,7 @@ import sys import subprocess import platform from .cmds.plugin import plugin_app, bind -from .cmds.library import lib_app, scan, typesense_index, reindex, watch +from .cmds.library import lib_app, scan, reindex, watch import psutil import signal from tabulate import tabulate @@ -54,7 +53,6 @@ def callback(ctx: typer.Context): # List of commands that require the server to be running server_dependent_commands = [ "scan", - "typesense-index", "reindex", "watch", @@ -82,12 +80,8 @@ app.add_typer(lib_app, name="lib", callback=callback) def serve(): """Run the server after initializing if necessary.""" db_success = init_database() - ts_success = True - if settings.typesense.enabled: - ts_success = init_typesense() - if db_success and (ts_success or not settings.typesense.enabled): + if db_success: from .server import run_server - run_server() else: print("Server initialization failed. Unable to start the server.") @@ -95,12 +89,9 @@ def serve(): @app.command() def init(): - """Initialize the database and Typesense collection if enabled.""" + """Initialize the database.""" db_success = init_database() - ts_success = True - if settings.typesense.enabled: - ts_success = init_typesense() - if db_success and (ts_success or not settings.typesense.enabled): + if db_success: print("Initialization completed successfully.") else: print("Initialization failed. 
Please check the error messages above.") @@ -180,36 +171,6 @@ def scan_default_library( scan(default_library["id"], path=path, plugins=plugins, folders=folders, force=force) -@app.command("typesense-index") -def typsense_index_default_library( - batchsize: int = typer.Option( - 4, "--batchsize", "-bs", help="Number of entities to index in a batch" - ), - force: bool = typer.Option(False, "--force", help="Force update all indexes"), -): - """ - Index the default library for memos. - """ - # Get the default library - response = httpx.get(f"{BASE_URL}/libraries") - if response.status_code != 200: - print(f"Failed to retrieve libraries: {response.status_code} - {response.text}") - return - - libraries = response.json() - default_library = next( - (lib for lib in libraries if lib["name"] == settings.default_library), None - ) - - if not default_library: - print("Default library does not exist.") - return - - typesense_index( - default_library["id"], force=force, folders=None, batchsize=batchsize - ) - - @app.command("reindex") def reindex_default_library( force: bool = typer.Option( diff --git a/memos/config.py b/memos/config.py index ba05e7b..4673e08 100644 --- a/memos/config.py +++ b/memos/config.py @@ -45,17 +45,6 @@ class EmbeddingSettings(BaseModel): use_local: bool = True -class TypesenseSettings(BaseModel): - # is disabled by default, and right now is quite unnecessary - enabled: bool = False - host: str = "localhost" - port: str = "8108" - protocol: str = "http" - api_key: str = "xyz" - connection_timeout_seconds: int = 10 - collection_name: str = "entities" - - class Settings(BaseSettings): model_config = SettingsConfigDict( yaml_file=str(Path.home() / ".memos" / "config.yaml"), @@ -81,9 +70,6 @@ class Settings(BaseSettings): # Embedding settings embedding: EmbeddingSettings = EmbeddingSettings() - # Typesense settings - typesense: TypesenseSettings = TypesenseSettings() - batchsize: int = 1 auth_username: str = "admin" @@ -116,7 +102,7 @@ class Settings(BaseSettings): @property def resolved_screenshots_dir(self) -> Path: return self.resolved_base_dir / self.screenshots_dir - + @property def server_endpoint(self) -> str: host = "127.0.0.1" if self.server_host == "0.0.0.0" else self.server_host @@ -162,9 +148,6 @@ settings = Settings() # Define the default database path os.makedirs(settings.resolved_base_dir, exist_ok=True) -# Global variable for Typesense collection name -TYPESENSE_COLLECTION_NAME = settings.typesense.collection_name - # Function to get the database path from environment variable or default def get_database_path(): @@ -172,9 +155,7 @@ def get_database_path(): def format_value(value): - if isinstance( - value, (VLMSettings, OCRSettings, EmbeddingSettings, TypesenseSettings) - ): + if isinstance(value, (VLMSettings, OCRSettings, EmbeddingSettings)): return ( "{\n" + "\n".join(f" {k}: {v}" for k, v in value.model_dump().items()) diff --git a/memos/default_config.yaml b/memos/default_config.yaml index 01aeee0..4af0058 100644 --- a/memos/default_config.yaml +++ b/memos/default_config.yaml @@ -48,13 +48,3 @@ embedding: # num_dim: 1536 # use_local: false # use_modelscope: false - -typesense: - enabled: false - api_key: xyz - collection_name: entities - connection_timeout_seconds: 10 - host: localhost - port: '8108' - protocol: http - diff --git a/memos/indexing.py b/memos/indexing.py deleted file mode 100644 index 9da83fb..0000000 --- a/memos/indexing.py +++ /dev/null @@ -1,394 +0,0 @@ -import json -import httpx -from typing import List -from datetime import datetime - -from 
.schemas import ( - MetadataType, - EntityMetadata, - EntityIndexItem, - MetadataIndexItem, - EntitySearchResult, - SearchResult, - Facet, - SearchHit, - TextMatchInfo, - HybridSearchInfo, - RequestParams, -) -from .config import settings, TYPESENSE_COLLECTION_NAME -from .embedding import get_embeddings - - -def convert_metadata_value(metadata: EntityMetadata): - if metadata.data_type == MetadataType.JSON_DATA: - return json.loads(metadata.value) - else: - return metadata.value - - -def parse_date_fields(entity): - timestamp_metadata = next( - (m for m in entity.metadata_entries if m.key == "timestamp"), None - ) - - if timestamp_metadata and len(timestamp_metadata.value) == 15: - try: - dt = datetime.strptime(timestamp_metadata.value, "%Y%m%d-%H%M%S") - except ValueError: - dt = entity.file_created_at - else: - dt = entity.file_created_at - - return { - "created_date": dt.strftime("%Y-%m-%d"), - "created_month": dt.strftime("%Y-%m"), - "created_year": dt.strftime("%Y"), - } - - -def generate_metadata_text(metadata_entries): - # 暂时不使用ocr结果 - def process_ocr_result(metadata): - try: - ocr_data = json.loads(metadata.value) - if isinstance(ocr_data, list) and all( - isinstance(item, dict) - and "dt_boxes" in item - and "rec_txt" in item - and "score" in item - for item in ocr_data - ): - return " ".join(item["rec_txt"] for item in ocr_data) - else: - return json.dumps(ocr_data, indent=2) - except json.JSONDecodeError: - return metadata.value - - non_ocr_metadata = [ - ( - f"key: {metadata.key}\nvalue:\n{json.dumps(json.loads(metadata.value), indent=2)}" - if metadata.data_type == MetadataType.JSON_DATA - else f"key: {metadata.key}\nvalue:\n{metadata.value}" - ) - for metadata in metadata_entries - if metadata.key != "ocr_result" - ] - metadata_text = "\n\n".join(non_ocr_metadata) - return metadata_text - - -async def bulk_upsert(client, entities): - documents = [] - metadata_texts = [] - entities_with_metadata = [] - - for entity in entities: - metadata_text = generate_metadata_text(entity.metadata_entries) - print(f"metadata_text: {len(metadata_text)}") - if metadata_text: - metadata_texts.append(metadata_text) - entities_with_metadata.append(entity) - - documents.append( - EntityIndexItem( - id=str(entity.id), - filepath=entity.filepath, - filename=entity.filename, - size=entity.size, - file_created_at=int(entity.file_created_at.timestamp()), - file_last_modified_at=int(entity.file_last_modified_at.timestamp()), - file_type=entity.file_type, - file_type_group=entity.file_type_group, - last_scan_at=( - int(entity.last_scan_at.timestamp()) - if entity.last_scan_at - else None - ), - library_id=entity.library_id, - folder_id=entity.folder_id, - tags=[tag.name for tag in entity.tags], - metadata_entries=[ - MetadataIndexItem( - key=metadata.key, - value=convert_metadata_value(metadata), - source=metadata.source, - ) - for metadata in entity.metadata_entries - ], - metadata_text=metadata_text, - **parse_date_fields(entity), - ).model_dump(mode="json") - ) - - embeddings = await get_embeddings(metadata_texts) - for doc, embedding, entity in zip(documents, embeddings, entities): - if entity in entities_with_metadata: - doc["embedding"] = embedding - - # Sync the entity data to Typesense - try: - response = client.collections[TYPESENSE_COLLECTION_NAME].documents.import_( - documents, {"action": "upsert"} - ) - return response - except Exception as e: - raise Exception( - f"Failed to sync entities to Typesense: {str(e)}", - ) - - -async def upsert(client, entity): - date_fields = 
parse_date_fields(entity) - metadata_text = generate_metadata_text(entity.metadata_entries) - embedding = (await get_embeddings([metadata_text]))[0] - - entity_data = EntityIndexItem( - id=str(entity.id), - filepath=entity.filepath, - filename=entity.filename, - size=entity.size, - file_created_at=int(entity.file_created_at.timestamp()), - file_last_modified_at=int(entity.file_last_modified_at.timestamp()), - file_type=entity.file_type, - file_type_group=entity.file_type_group, - last_scan_at=( - int(entity.last_scan_at.timestamp()) if entity.last_scan_at else None - ), - library_id=entity.library_id, - folder_id=entity.folder_id, - tags=[tag.name for tag in entity.tags], - metadata_entries=[ - MetadataIndexItem( - key=metadata.key, - value=convert_metadata_value(metadata), - source=metadata.source, - ) - for metadata in entity.metadata_entries - ], - metadata_text=metadata_text, - embedding=embedding, - created_date=date_fields.get("created_date"), - created_month=date_fields.get("created_month"), - created_year=date_fields.get("created_year"), - ) - - # Sync the entity data to Typesense - try: - client.collections[TYPESENSE_COLLECTION_NAME].documents.upsert( - entity_data.model_dump_json() - ) - except Exception as e: - raise Exception( - f"Failed to sync entity to Typesense: {str(e)}", - ) - - -def remove_entity_by_id(client, entity_id): - try: - client.collections[TYPESENSE_COLLECTION_NAME].documents[entity_id].delete() - except Exception as e: - raise Exception( - f"Failed to remove entity from Typesense: {str(e)}", - ) - - -def list_all_entities( - client, library_id: int, folder_id: int, limit=100, offset=0 -) -> List[EntityIndexItem]: - try: - response = client.collections[TYPESENSE_COLLECTION_NAME].documents.search( - { - "q": "*", - "filter_by": f"library_id:={library_id} && folder_id:={folder_id}", - "per_page": limit, - "page": offset // limit + 1, - } - ) - return [ - EntityIndexItem( - id=hit["document"]["id"], - filepath=hit["document"]["filepath"], - filename=hit["document"]["filename"], - size=hit["document"]["size"], - file_created_at=hit["document"]["file_created_at"], - file_last_modified_at=hit["document"]["file_last_modified_at"], - file_type=hit["document"]["file_type"], - file_type_group=hit["document"]["file_type_group"], - last_scan_at=hit["document"].get("last_scan_at"), - library_id=hit["document"]["library_id"], - folder_id=hit["document"]["folder_id"], - tags=hit["document"]["tags"], - metadata_entries=[ - MetadataIndexItem( - key=entry["key"], value=entry["value"], source=entry["source"] - ) - for entry in hit["document"]["metadata_entries"] - ], - metadata_text=hit["document"]["metadata_text"], - created_date=hit["document"].get("created_date"), - created_month=hit["document"].get("created_month"), - created_year=hit["document"].get("created_year"), - ) - for hit in response["hits"] - ] - except Exception as e: - raise Exception( - f"Failed to list entities for library {library_id} and folder {folder_id}: {str(e)}", - ) - - -async def search_entities( - client, - q: str, - library_ids: List[int] = None, - folder_ids: List[int] = None, - tags: List[str] = None, - created_dates: List[str] = None, - limit: int = 48, - offset: int = 0, - start: int = None, - end: int = None, -) -> SearchResult: - try: - filter_by = [] - if library_ids: - filter_by.append(f"library_id:[{','.join(map(str, library_ids))}]") - if folder_ids: - filter_by.append(f"folder_id:[{','.join(map(str, folder_ids))}]") - if start is not None and end is not None: - 
filter_by.append(f"file_created_at:={start}..{end}") - if tags: - filter_by.append(f"tags:=[{','.join(tags)}]") - if created_dates: - filter_by.append(f"created_date:[{','.join(created_dates)}]") - - filter_by_str = " && ".join(filter_by) if filter_by else "" - - # Convert q to embedding using get_embeddings and take the first embedding - embedding = (await get_embeddings([q]))[0] - - common_search_params = { - "collection": TYPESENSE_COLLECTION_NAME, - } - - search_parameters = { - "q": q, - "query_by": "tags,filename,filepath,metadata_text", - "infix": "off,always,always,off", - "prefix": "true,true,true,false", - "filter_by": ( - f"{filter_by_str} && file_type_group:=image" - if filter_by_str - else "file_type_group:=image" - ), - "limit": limit, - "offset": offset, - "exclude_fields": "metadata_text,embedding", - "sort_by": "_text_match:desc,file_created_at:desc", - "facet_by": "created_date,created_month,created_year,tags", - "vector_query": f"embedding:({embedding}, k:{limit})", - } - - search_parameters_to_print = search_parameters.copy() - search_parameters_to_print["vector_query"] = f"embedding:([...], k:{limit})" - print(json.dumps(search_parameters_to_print, indent=2)) - - search_response = client.multi_search.perform( - {"searches": [search_parameters]}, common_search_params - ) - - search_results = search_response["results"][0] - - hits = [ - SearchHit( - document=EntitySearchResult( - id=hit["document"]["id"], - filepath=hit["document"]["filepath"], - filename=hit["document"]["filename"], - size=hit["document"]["size"], - file_created_at=hit["document"]["file_created_at"], - file_last_modified_at=hit["document"]["file_last_modified_at"], - file_type=hit["document"]["file_type"], - file_type_group=hit["document"]["file_type_group"], - last_scan_at=hit["document"].get("last_scan_at"), - library_id=hit["document"]["library_id"], - folder_id=hit["document"]["folder_id"], - tags=hit["document"]["tags"], - metadata_entries=[ - MetadataIndexItem( - key=entry["key"], - value=entry["value"], - source=entry["source"], - ) - for entry in hit["document"]["metadata_entries"] - ], - created_date=hit["document"].get("created_date"), - created_month=hit["document"].get("created_month"), - created_year=hit["document"].get("created_year"), - ), - highlight=hit.get("highlight", {}), - highlights=hit.get("highlights", []), - hybrid_search_info=( - HybridSearchInfo(**hit["hybrid_search_info"]) - if hit.get("hybrid_search_info") - else None - ), - text_match=hit.get("text_match"), - text_match_info=( - TextMatchInfo(**hit["text_match_info"]) - if hit.get("text_match_info") - else None - ), - ) - for hit in search_results["hits"] - ] - - return SearchResult( - facet_counts=[Facet(**facet) for facet in search_results["facet_counts"]], - found=search_results["found"], - hits=hits, - out_of=search_results["out_of"], - page=search_results["page"], - request_params=RequestParams(**search_results["request_params"]), - search_cutoff=search_results["search_cutoff"], - search_time_ms=search_results["search_time_ms"], - ) - except Exception as e: - raise Exception( - f"Failed to search entities: {str(e)}", - ) - - -def fetch_entity_by_id(client, id: str) -> EntityIndexItem: - try: - document = ( - client.collections[TYPESENSE_COLLECTION_NAME].documents[id].retrieve() - ) - return EntitySearchResult( - id=document["id"], - filepath=document["filepath"], - filename=document["filename"], - size=document["size"], - file_created_at=document["file_created_at"], - 
file_last_modified_at=document["file_last_modified_at"], - file_type=document["file_type"], - file_type_group=document["file_type_group"], - last_scan_at=document.get("last_scan_at"), - library_id=document["library_id"], - folder_id=document["folder_id"], - tags=document["tags"], - metadata_entries=[ - MetadataIndexItem( - key=entry["key"], value=entry["value"], source=entry["source"] - ) - for entry in document["metadata_entries"] - ], - created_date=document.get("created_date"), - created_month=document.get("created_month"), - created_year=document.get("created_year"), - ) - except Exception as e: - raise Exception( - f"Failed to fetch document by id: {str(e)}", - ) diff --git a/memos/initialize_typesense.py b/memos/initialize_typesense.py deleted file mode 100644 index a5bb936..0000000 --- a/memos/initialize_typesense.py +++ /dev/null @@ -1,170 +0,0 @@ -import typesense -from .config import settings, TYPESENSE_COLLECTION_NAME -import sys -import logging - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Define the schema for the Typesense collection -schema = { - "name": TYPESENSE_COLLECTION_NAME, - "enable_nested_fields": True, - "fields": [ - {"name": "filepath", "type": "string", "infix": True}, - {"name": "filename", "type": "string", "infix": True}, - {"name": "size", "type": "int32"}, - {"name": "file_created_at", "type": "int64", "facet": False}, - { - "name": "created_date", - "type": "string", - "facet": True, - "optional": True, - "sort": True, - }, - { - "name": "created_month", - "type": "string", - "facet": True, - "optional": True, - "sort": True, - }, - { - "name": "created_year", - "type": "string", - "facet": True, - "optional": True, - "sort": True, - }, - {"name": "file_last_modified_at", "type": "int64", "facet": False}, - {"name": "file_type", "type": "string", "facet": True}, - {"name": "file_type_group", "type": "string", "facet": True}, - {"name": "last_scan_at", "type": "int64", "facet": False, "optional": True}, - {"name": "library_id", "type": "int32", "facet": True}, - {"name": "folder_id", "type": "int32", "facet": True}, - { - "name": "tags", - "type": "string[]", - "facet": True, - "optional": True, - "locale": "zh", - }, - { - "name": "metadata_entries", - "type": "object[]", - "optional": True, - "locale": "zh", - }, - {"name": "metadata_text", "type": "string", "optional": True, "locale": "zh"}, - { - "name": "embedding", - "type": "float[]", - "num_dim": settings.embedding.num_dim, - "optional": True, - }, - { - "name": "image_embedding", - "type": "float[]", - "optional": True, - }, - ], - "token_separators": [":", "/", " ", "\\"], -} - -def update_collection_fields(client, schema): - existing_collection = client.collections[TYPESENSE_COLLECTION_NAME].retrieve() - existing_fields = {field["name"]: field for field in existing_collection["fields"]} - new_fields = {field["name"]: field for field in schema["fields"]} - - fields_to_add = [] - for name, field in new_fields.items(): - if name not in existing_fields: - fields_to_add.append(field) - else: - # Check if the field can be updated - updatable_properties = ["facet", "optional"] - for prop in updatable_properties: - if prop in field and field[prop] != existing_fields[name].get(prop): - fields_to_add.append(field) - break - - if fields_to_add: - client.collections[TYPESENSE_COLLECTION_NAME].update({"fields": fields_to_add}) - print( - f"Added/updated {len(fields_to_add)} fields in the '{TYPESENSE_COLLECTION_NAME}' collection." 
- ) - else: - print( - f"No new fields to add or update in the '{TYPESENSE_COLLECTION_NAME}' collection." - ) - -def init_typesense(): - """Initialize the Typesense collection.""" - if not settings.typesense.enabled: - logger.warning("Typesense is not enabled. Skipping initialization.") - return False - - try: - client = typesense.Client( - { - "nodes": [ - { - "host": settings.typesense_host, - "port": settings.typesense_port, - "protocol": settings.typesense_protocol, - } - ], - "api_key": settings.typesense_api_key, - "connection_timeout_seconds": settings.typesense_connection_timeout_seconds, - } - ) - - existing_collections = client.collections.retrieve() - collection_names = [c["name"] for c in existing_collections] - if TYPESENSE_COLLECTION_NAME not in collection_names: - client.collections.create(schema) - logger.info(f"Typesense collection '{TYPESENSE_COLLECTION_NAME}' created successfully.") - else: - update_collection_fields(client, schema) - logger.info(f"Typesense collection '{TYPESENSE_COLLECTION_NAME}' already exists. Updated fields if necessary.") - return True - except Exception as e: - logger.error(f"Error initializing Typesense collection: {e}") - return False - -if __name__ == "__main__": - import argparse - import sys - - parser = argparse.ArgumentParser() - parser.add_argument("--force", action="store_true", help="Drop the collection before initializing") - args = parser.parse_args() - - if not settings.typesense.enabled: - logger.warning("Typesense is not enabled. Please enable it in the configuration if you want to use Typesense.") - sys.exit(0) - - client = typesense.Client( - { - "nodes": [ - { - "host": settings.typesense_host, - "port": settings.typesense_port, - "protocol": settings.typesense_protocol, - } - ], - "api_key": settings.typesense_api_key, - "connection_timeout_seconds": settings.typesense_connection_timeout_seconds, - } - ) - - if args.force: - try: - client.collections[TYPESENSE_COLLECTION_NAME].delete() - logger.info(f"Dropped collection '{TYPESENSE_COLLECTION_NAME}'.") - except Exception as e: - logger.error(f"Error dropping collection: {e}") - - if not init_typesense(): - sys.exit(1) diff --git a/memos/schemas.py b/memos/schemas.py index 401c76e..069a1cf 100644 --- a/memos/schemas.py +++ b/memos/schemas.py @@ -195,27 +195,6 @@ class MetadataIndexItem(BaseModel): source: str -class EntityIndexItem(BaseModel): - id: str - filepath: str - filename: str - size: int - file_created_at: int = Field(..., description="Unix timestamp") - created_date: Optional[str] = None - created_month: Optional[str] = None - created_year: Optional[str] = None - file_last_modified_at: int = Field(..., description="Unix timestamp") - file_type: str - file_type_group: str - last_scan_at: Optional[int] = Field(None, description="Unix timestamp") - library_id: int - folder_id: int - tags: List[str] - metadata_entries: List[MetadataIndexItem] - metadata_text: str - embedding: Optional[List[float]] = Field(None, description="Embedding vector") - - class EntitySearchResult(BaseModel): id: str filepath: str diff --git a/memos/server.py b/memos/server.py index 52357d5..8444fe2 100644 --- a/memos/server.py +++ b/memos/server.py @@ -20,12 +20,10 @@ from secrets import compare_digest import functools import logging -import typesense - from .config import get_database_path, settings from memos.plugins.vlm import main as vlm_main from memos.plugins.ocr import main as ocr_main -from . import crud, indexing +from . 
import crud from .schemas import ( Library, Folder, @@ -40,7 +38,6 @@ from .schemas import ( UpdateEntityTagsParam, UpdateEntityMetadataParam, MetadataType, - EntityIndexItem, MetadataIndexItem, EntitySearchResult, SearchResult, @@ -62,23 +59,6 @@ engine = create_engine(f"sqlite:///{get_database_path()}") event.listen(engine, "connect", load_extension) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) -# Initialize Typesense client only if enabled -client = None -if settings.typesense.enabled: - client = typesense.Client( - { - "nodes": [ - { - "host": settings.typesense.host, - "port": settings.typesense.port, - "protocol": settings.typesense.protocol, - } - ], - "api_key": settings.typesense.api_key, - "connection_timeout_seconds": settings.typesense.connection_timeout_seconds, - } - ) - app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -405,175 +385,6 @@ def update_entity_last_scan_at(entity_id: int, db: Session = Depends(get_db)): ) -def typesense_required(func): - @functools.wraps(func) - async def wrapper(*args, **kwargs): - if not settings.typesense.enabled: - raise HTTPException( - status_code=status.HTTP_503_SERVICE_UNAVAILABLE, - detail="Typesense is not enabled", - ) - return await func(*args, **kwargs) - - return wrapper - - -@app.post( - "/entities/{entity_id}/index", - status_code=status.HTTP_204_NO_CONTENT, - tags=["entity"], -) -@typesense_required -async def sync_entity_to_typesense(entity_id: int, db: Session = Depends(get_db)): - entity = crud.get_entity_by_id(entity_id, db) - if entity is None: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="Entity not found", - ) - - try: - indexing.upsert(client, entity) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e), - ) - return None - - -@app.post( - "/entities/batch-index", - status_code=status.HTTP_204_NO_CONTENT, - tags=["entity"], -) -@typesense_required -async def batch_sync_entities_to_typesense( - entity_ids: List[int], db: Session = Depends(get_db) -): - entities = crud.find_entities_by_ids(entity_ids, db) - if not entities: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="No entities found", - ) - - try: - await indexing.bulk_upsert(client, entities) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e), - ) - return None - - -@app.get( - "/entities/{entity_id}/index", - response_model=EntitySearchResult, - tags=["entity"], -) -@typesense_required -async def get_entity_index(entity_id: int) -> EntityIndexItem: - try: - entity_index_item = indexing.fetch_entity_by_id(client, entity_id) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(e), - ) - - return entity_index_item - - -@app.delete( - "/entities/{entity_id}/index", - status_code=status.HTTP_204_NO_CONTENT, - tags=["entity"], -) -@typesense_required -async def remove_entity_from_typesense(entity_id: int, db: Session = Depends(get_db)): - try: - indexing.remove_entity_by_id(client, entity_id) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e), - ) - return None - - -@app.get( - "/libraries/{library_id}/folders/{folder_id}/index", - response_model=List[EntityIndexItem], - tags=["entity"], -) -@typesense_required -def list_entitiy_indices_in_folder( - library_id: int, - folder_id: int, - limit: Annotated[int, Query(ge=1, le=200)] = 10, - 
offset: int = 0, - db: Session = Depends(get_db), -): - library = crud.get_library_by_id(library_id, db) - if library is None: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, detail="Library not found" - ) - - if folder_id not in [folder.id for folder in library.folders]: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="Folder not found in the specified library", - ) - - return indexing.list_all_entities(client, library_id, folder_id, limit, offset) - - -@app.get("/search/v2", response_model=SearchResult, tags=["search"]) -@typesense_required -async def search_entities( - q: str, - library_ids: str = Query(None, description="Comma-separated list of library IDs"), - folder_ids: str = Query(None, description="Comma-separated list of folder IDs"), - tags: str = Query(None, description="Comma-separated list of tags"), - created_dates: str = Query( - None, description="Comma-separated list of created dates in YYYY-MM-DD format" - ), - limit: Annotated[int, Query(ge=1, le=200)] = 48, - offset: int = 0, - start: int = None, - end: int = None, - db: Session = Depends(get_db), -): - library_ids = [int(id) for id in library_ids.split(",")] if library_ids else None - folder_ids = [int(id) for id in folder_ids.split(",")] if folder_ids else None - tags = [tag.strip() for tag in tags.split(",")] if tags else None - created_dates = ( - [date.strip() for date in created_dates.split(",")] if created_dates else None - ) - try: - return await indexing.search_entities( - client, - q, - library_ids, - folder_ids, - tags, - created_dates, - limit, - offset, - start, - end, - ) - except Exception as e: - print(f"Error searching entities: {e}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(e), - ) - - @app.put("/entities/{entity_id}/tags", response_model=Entity, tags=["entity"]) def replace_entity_tags( entity_id: int, update_tags: UpdateEntityTagsParam, db: Session = Depends(get_db) @@ -901,52 +712,39 @@ def get_entity_context( ): """ Get the context (previous and next entities) for a given entity. 
- + Args: library_id: The ID of the library entity_id: The ID of the target entity prev: Number of previous entities to fetch (optional) next: Number of next entities to fetch (optional) - + Returns: EntityContext object containing prev and next lists of entities """ # If both prev and next are None, return empty lists if prev is None and next is None: return EntityContext(prev=[], next=[]) - + # Convert None to 0 for the crud function prev_count = prev if prev is not None else 0 next_count = next if next is not None else 0 - + # Get the context entities prev_entities, next_entities = crud.get_entity_context( db=db, library_id=library_id, entity_id=entity_id, prev=prev_count, - next=next_count + next=next_count, ) - + # Return the context object - return EntityContext( - prev=prev_entities, - next=next_entities - ) + return EntityContext(prev=prev_entities, next=next_entities) def run_server(): logging.info("Database path: %s", get_database_path()) - if settings.typesense.enabled: - logging.info( - "Typesense connection info: Host: %s, Port: %s, Protocol: %s, Collection Name: %s", - settings.typesense.host, - settings.typesense.port, - settings.typesense.protocol, - settings.typesense.collection_name, - ) - else: - logging.info("Typesense is disabled") logging.info("VLM plugin enabled: %s", settings.vlm) logging.info("OCR plugin enabled: %s", settings.ocr) diff --git a/pyproject.toml b/pyproject.toml index 3be2b55..f586aa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "typer", "magika", "pydantic-settings", - "typesense", "opencv-python", "pillow", "piexif", diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 276e577..0000000 --- a/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -fastapi -uvicorn -httpx -pydantic -sqlalchemy -typer -tabulate -magika -pydantic-settings -typesense diff --git a/search-engine/run-docker.bat b/search-engine/run-docker.bat deleted file mode 100644 index 845e342..0000000 --- a/search-engine/run-docker.bat +++ /dev/null @@ -1,12 +0,0 @@ -@echo off - -set TYPESENSE_API_KEY=xyz - -if not exist "%CD%\typesense-data" mkdir "%CD%\typesense-data" - -docker run -d -p 8108:8108 ^ - -v "%CD%\typesense-data:/data" typesense/typesense:26.0 ^ - --add-host=host.docker.internal:host-gateway ^ - --data-dir /data ^ - --api-key=%TYPESENSE_API_KEY% ^ - --enable-cors \ No newline at end of file diff --git a/search-engine/run-docker.sh b/search-engine/run-docker.sh deleted file mode 100644 index 7df6c3d..0000000 --- a/search-engine/run-docker.sh +++ /dev/null @@ -1,11 +0,0 @@ -export TYPESENSE_API_KEY=xyz - -mkdir "$(pwd)"/typesense-data - -docker run -d -p 8108:8108 \ - --restart always \ - -v"$(pwd)"/typesense-data:/data typesense/typesense:27.0 \ - --add-host=host.docker.internal:host-gateway \ - --data-dir /data \ - --api-key=$TYPESENSE_API_KEY \ - --enable-cors diff --git a/web/package-lock.json b/web/package-lock.json index 538c72c..1f3f96b 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -20,8 +20,7 @@ "svelte-i18n": "^4.0.0", "svelte-sonner": "^0.3.27", "tailwind-merge": "^2.4.0", - "tailwind-variants": "^0.2.1", - "typesense": "^1.7.2" + "tailwind-variants": "^0.2.1" }, "devDependencies": { "@sveltejs/adapter-auto": "^2.1.1", @@ -69,18 +68,6 @@ "node": ">=6.0.0" } }, - "node_modules/@babel/runtime": { - "version": "7.24.7", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.24.7.tgz", - "integrity": 
"sha512-UwgBRMjJP+xv857DCngvqXI3Iq6J4v0wXmwc6sapg+zyhbwmQX67LUEFrkK5tbyJ30jGuG3ZvWpBiB9LCy1kWw==", - "peer": true, - "dependencies": { - "regenerator-runtime": "^0.14.0" - }, - "engines": { - "node": ">=6.9.0" - } - }, "node_modules/@cspotcode/source-map-support": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", @@ -1711,11 +1698,6 @@ "node": ">=8" } }, - "node_modules/asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" - }, "node_modules/autoprefixer": { "version": "10.4.19", "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.19.tgz", @@ -1753,16 +1735,6 @@ "postcss": "^8.1.0" } }, - "node_modules/axios": { - "version": "1.7.2", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.2.tgz", - "integrity": "sha512-2A8QhOMrbomlDuiLeK9XibIBzuHeRcqqNOHp0Cyp5EoJ1IFDh+XZH3A6BkXtv0K4gFGCI0Y4BM7B1wOEi0Rmgw==", - "dependencies": { - "follow-redirects": "^1.15.6", - "form-data": "^4.0.0", - "proxy-from-env": "^1.1.0" - } - }, "node_modules/axobject-query": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.0.0.tgz", @@ -2039,17 +2011,6 @@ "simple-swizzle": "^0.2.2" } }, - "node_modules/combined-stream": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", - "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "dependencies": { - "delayed-stream": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/commander": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", @@ -2225,14 +2186,6 @@ "node": ">=0.10.0" } }, - "node_modules/delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "engines": { - "node": ">=0.4.0" - } - }, "node_modules/dequal": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", @@ -2831,25 +2784,6 @@ "tabbable": "^6.2.0" } }, - "node_modules/follow-redirects": { - "version": "1.15.6", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", - "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/foreground-child": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.2.1.tgz", @@ -2865,19 +2799,6 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/form-data": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", - "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 6" - } - }, "node_modules/fraction.js": { "version": "4.3.7", "resolved": 
"https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", @@ -3353,18 +3274,6 @@ "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==" }, - "node_modules/loglevel": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/loglevel/-/loglevel-1.9.1.tgz", - "integrity": "sha512-hP3I3kCrDIMuRwAwHltphhDM1r8i55H33GgqjXbrisuJhF4kRhW1dNuxsRklp4bXl8DSdLaNLuiL4A/LWRfxvg==", - "engines": { - "node": ">= 0.6.0" - }, - "funding": { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/loglevel" - } - }, "node_modules/lru-cache": { "version": "10.3.0", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.3.0.tgz", @@ -3458,25 +3367,6 @@ "node": ">=8.6" } }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, "node_modules/min-indent": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/min-indent/-/min-indent-1.0.1.tgz", @@ -4009,11 +3899,6 @@ "svelte": "^3.2.0 || ^4.0.0-next.0" } }, - "node_modules/proxy-from-env": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" - }, "node_modules/punycode": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", @@ -4061,12 +3946,6 @@ "node": ">=8.10.0" } }, - "node_modules/regenerator-runtime": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz", - "integrity": "sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==", - "peer": true - }, "node_modules/resolve": { "version": "1.22.8", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", @@ -5376,21 +5255,6 @@ "node": ">=14.17" } }, - "node_modules/typesense": { - "version": "1.8.2", - "resolved": "https://registry.npmjs.org/typesense/-/typesense-1.8.2.tgz", - "integrity": "sha512-aBpePjA99Qvo+OP2pJwMpvga4Jrm1Y2oV5NsrWXBxlqUDNEUCPZBIksPv2Hq0jxQxHhLLyJVbjXjByXsvpCDVA==", - "dependencies": { - "axios": "^1.6.0", - "loglevel": "^1.8.1" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@babel/runtime": "^7.23.2" - } - }, "node_modules/undici": { "version": "5.28.4", "resolved": "https://registry.npmjs.org/undici/-/undici-5.28.4.tgz", diff --git a/web/package.json b/web/package.json index 8613054..dce42e5 100644 --- a/web/package.json +++ b/web/package.json @@ -48,7 +48,6 @@ "svelte-i18n": "^4.0.0", "svelte-sonner": "^0.3.27", "tailwind-merge": "^2.4.0", - "tailwind-variants": "^0.2.1", - "typesense": "^1.7.2" + "tailwind-variants": "^0.2.1" } }