# Standard library imports
import time
import math
import re
import os
import threading
import asyncio
import logging
import logging.config
from pathlib import Path
from datetime import datetime
from enum import Enum
from typing import List, Tuple
from functools import lru_cache
from collections import defaultdict, deque

# Third-party imports
import typer
import httpx
from tqdm import tqdm
from tabulate import tabulate
import psutil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from concurrent.futures import ThreadPoolExecutor

# Local imports
from memos.config import settings
from memos.utils import get_image_metadata
from memos.schemas import MetadataSource
from memos.logging_config import LOGGING_CONFIG


logging.config.dictConfig(LOGGING_CONFIG)
logger = logging.getLogger(__name__)

lib_app = typer.Typer()

file_detector = None

IS_THUMBNAIL = "is_thumbnail"

BASE_URL = settings.server_endpoint

include_files = [".jpg", ".jpeg", ".png", ".webp"]


class FileStatus(Enum):
    UPDATED = "updated"
    ADDED = "added"


def format_timestamp(timestamp):
    if isinstance(timestamp, str):
        return timestamp
    return datetime.fromtimestamp(timestamp).replace(tzinfo=None).isoformat()


def init_file_detector():
    """Initialize the global file detector if not already initialized"""
    global file_detector
    if file_detector is None:
        from magika import Magika

        file_detector = Magika()
    return file_detector


def get_file_type(file_path):
    """Get file type using lazy-loaded detector"""
    detector = init_file_detector()
    file_result = detector.identify_path(file_path)
    return file_result.output.ct_label, file_result.output.group

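# Note: Magika is imported inside init_file_detector() rather than at module
# level, so its model only loads the first time a file is actually identified
# and plain CLI commands stay fast to start. identify_path() reports a
# content-type label and group; for a screenshot PNG that is typically
# something like ("png", "image"), though the exact labels depend on the
# Magika model version in use.
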

def display_libraries(libraries):
    table = []
    for library in libraries:
        table.append(
            [
                library["id"],
                library["name"],
                "\n".join(
                    f"{folder['id']}: {folder['path']}" for folder in library["folders"]
                ),
                "\n".join(
                    f"{plugin['id']}: {plugin['name']} {plugin['webhook_url']}"
                    for plugin in library["plugins"]
                ),
            ]
        )

    print(
        tabulate(table, headers=["ID", "Name", "Folders", "Plugins"], tablefmt="plain")
    )


@lib_app.command("ls")
def ls():
    response = httpx.get(f"{BASE_URL}/libraries")
    libraries = response.json()
    display_libraries(libraries)


@lib_app.command("create")
def add(name: str, folders: List[str]):
    absolute_folders = []
    for folder in folders:
        folder_path = Path(folder).resolve()
        absolute_folders.append(
            {
                "path": str(folder_path),
                "last_modified_at": datetime.fromtimestamp(
                    folder_path.stat().st_mtime
                ).isoformat(),
            }
        )

    response = httpx.post(
        f"{BASE_URL}/libraries", json={"name": name, "folders": absolute_folders}
    )
    if 200 <= response.status_code < 300:
        print("Library created successfully")
    else:
        print(f"Failed to create library: {response.status_code} - {response.text}")


@lib_app.command("add-folder")
def add_folder(library_id: int, folders: List[str]):
    absolute_folders = []
    for folder in folders:
        folder_path = Path(folder).resolve()
        absolute_folders.append(
            {
                "path": str(folder_path),
                "last_modified_at": datetime.fromtimestamp(
                    folder_path.stat().st_mtime
                ).isoformat(),
            }
        )

    response = httpx.post(
        f"{BASE_URL}/libraries/{library_id}/folders",
        json={"folders": absolute_folders},
    )
    if 200 <= response.status_code < 300:
        print("Folders added successfully")
        library = response.json()
        display_libraries([library])
    else:
        print(f"Failed to add folders: {response.status_code} - {response.text}")


@lib_app.command("show")
def show(library_id: int):
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code == 200:
        library = response.json()
        display_libraries([library])
    else:
        print(f"Failed to retrieve library: {response.status_code} - {response.text}")


def is_temp_file(filename):
    return (
        filename.startswith(".")
        or filename.startswith("tmp")
        or filename.startswith("temp")
    )

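# For example, is_temp_file(".DS_Store"), is_temp_file("tmp_upload.png") and
# is_temp_file("temp.jpg") are all True, while is_temp_file("shot.png") is False.
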

async def loop_files(library_id, folder, folder_path, force, plugins, batch_size):
    updated_file_count = 0
    added_file_count = 0
    scanned_files = set()
    semaphore = asyncio.Semaphore(batch_size)

    async with httpx.AsyncClient(timeout=60) as client:
        tasks = []
        for root, _, files in os.walk(folder_path):
            with tqdm(total=len(files), desc=f"Scanning {root}", leave=True) as pbar:
                candidate_files = []
                for file in files:
                    file_path = Path(root) / file
                    absolute_file_path = file_path.resolve()  # Get absolute path
                    relative_path = absolute_file_path.relative_to(folder_path)

                    # Check if the file extension is in the include_files list and not a temp file
                    if file_path.suffix.lower() in include_files and not is_temp_file(
                        file
                    ):
                        scanned_files.add(str(absolute_file_path))
                        candidate_files.append(str(absolute_file_path))

                batching = 200
                for i in range(0, len(candidate_files), batching):
                    batch = candidate_files[i : i + batching]

                    # Get batch of entities
                    get_response = await client.post(
                        f"{BASE_URL}/libraries/{library_id}/entities/by-filepaths",
                        json=batch,
                    )

                    if get_response.status_code == 200:
                        existing_entities = get_response.json()
                    else:
                        print(
                            f"Failed to get entities: {get_response.status_code} - {get_response.text}"
                        )
                        continue

                    existing_entities_dict = {
                        entity["filepath"]: entity for entity in existing_entities
                    }

                    for file_path in batch:
                        absolute_file_path = Path(file_path).resolve()
                        file_stat = absolute_file_path.stat()
                        file_type, file_type_group = get_file_type(absolute_file_path)

                        new_entity = {
                            "filename": absolute_file_path.name,
                            "filepath": str(absolute_file_path),
                            "size": file_stat.st_size,
                            "file_created_at": format_timestamp(file_stat.st_ctime),
                            "file_last_modified_at": format_timestamp(
                                file_stat.st_mtime
                            ),
                            "file_type": file_type,
                            "file_type_group": file_type_group,
                            "folder_id": folder["id"],
                        }

                        is_thumbnail = False

                        if file_type_group == "image":
                            metadata = get_image_metadata(absolute_file_path)
                            if metadata:
                                if (
                                    "active_window" in metadata
                                    and "active_app" not in metadata
                                ):
                                    metadata["active_app"] = metadata[
                                        "active_window"
                                    ].split(" - ")[0]
                                new_entity["metadata_entries"] = [
                                    {
                                        "key": key,
                                        "value": str(value),
                                        "source": MetadataSource.SYSTEM_GENERATED.value,
                                        "data_type": (
                                            "number"
                                            if isinstance(value, (int, float))
                                            else "text"
                                        ),
                                    }
                                    for key, value in metadata.items()
                                    if key != IS_THUMBNAIL
                                ]
                                if "active_app" in metadata:
                                    new_entity.setdefault("tags", []).append(
                                        metadata["active_app"]
                                    )
                                is_thumbnail = metadata.get(IS_THUMBNAIL, False)

                        existing_entity = existing_entities_dict.get(
                            str(absolute_file_path)
                        )
                        if existing_entity:
                            existing_created_at = format_timestamp(
                                existing_entity["file_created_at"]
                            )
                            new_created_at = format_timestamp(
                                new_entity["file_created_at"]
                            )
                            existing_modified_at = format_timestamp(
                                existing_entity["file_last_modified_at"]
                            )
                            new_modified_at = format_timestamp(
                                new_entity["file_last_modified_at"]
                            )

                            # Ignore file changes for thumbnails
                            if is_thumbnail:
                                new_entity["file_created_at"] = existing_entity[
                                    "file_created_at"
                                ]
                                new_entity["file_last_modified_at"] = existing_entity[
                                    "file_last_modified_at"
                                ]
                                new_entity["file_type"] = existing_entity["file_type"]
                                new_entity["file_type_group"] = existing_entity[
                                    "file_type_group"
                                ]
                                new_entity["size"] = existing_entity["size"]

                            # Merge existing metadata with new metadata
                            if new_entity.get("metadata_entries"):
                                new_metadata_keys = {
                                    entry["key"]
                                    for entry in new_entity["metadata_entries"]
                                }
                                for existing_entry in existing_entity[
                                    "metadata_entries"
                                ]:
                                    if existing_entry["key"] not in new_metadata_keys:
                                        new_entity["metadata_entries"].append(
                                            existing_entry
                                        )

                            if (
                                force
                                or existing_created_at != new_created_at
                                or existing_modified_at != new_modified_at
                            ):
                                tasks.append(
                                    update_entity(
                                        client,
                                        semaphore,
                                        plugins,
                                        new_entity,
                                        existing_entity,
                                    )
                                )
                        elif not is_thumbnail:  # Ignore thumbnails
                            tasks.append(
                                add_entity(
                                    client, semaphore, library_id, plugins, new_entity
                                )
                            )
                    pbar.update(len(batch))
                    pbar.set_postfix({"Candidates": len(tasks)}, refresh=True)

        # Process all tasks after they've been created
        for future in tqdm(
            asyncio.as_completed(tasks),
            desc=f"Processing {folder_path}",
            total=len(tasks),
            leave=True,
        ):
            file_path, file_status, succeeded, response = await future
            if file_status == FileStatus.ADDED:
                if succeeded:
                    added_file_count += 1
                    tqdm.write(f"Added file to library: {file_path}")
                else:
                    error_message = "Failed to add file"
                    if hasattr(response, "status_code"):
                        error_message += f": {response.status_code}"
                    if hasattr(response, "text"):
                        error_message += f" - {response.text}"
                    else:
                        error_message += " - Unknown error occurred"
                    tqdm.write(error_message)
            elif file_status == FileStatus.UPDATED:
                if succeeded:
                    updated_file_count += 1
                    tqdm.write(f"Updated file in library: {file_path}")
                else:
                    error_message = "Failed to update file"
                    if hasattr(response, "status_code"):
                        error_message += f": {response.status_code}"
                    if hasattr(response, "text"):
                        error_message += f" - {response.text}"
                    else:
                        error_message += " - Unknown error occurred"
                    tqdm.write(error_message)

    return added_file_count, updated_file_count, scanned_files

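# loop_files walks one library folder, looks up existing entities in batches of
# 200 file paths, and schedules add_entity/update_entity coroutines for new or
# changed files. The asyncio.Semaphore(batch_size) inside it caps the number of
# concurrent add/update requests, and the function returns
# (added_count, updated_count, scanned_files) so that scan() can aggregate
# totals and detect deletions afterwards.
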

@lib_app.command("scan")
def scan(
    library_id: int,
    path: str = typer.Argument(None, help="Path to scan within the library"),
    force: bool = False,
    plugins: List[int] = typer.Option(None, "--plugin", "-p"),
    folders: List[int] = typer.Option(None, "--folder", "-f"),
    batch_size: int = typer.Option(
        1, "--batch-size", "-bs", help="Batch size for processing files"
    ),
):
    # Check if both path and folders are provided
    if path and folders:
        print("Error: You cannot specify both a path and folders at the same time.")
        return

    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Failed to retrieve library: {response.status_code} - {response.text}")
        return

    library = response.json()
    total_files_added = 0
    total_files_updated = 0
    total_files_deleted = 0

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    # Check if a specific path is provided
    if path:
        path = Path(path).expanduser().resolve()
        # Check if the path is a folder or a subdirectory of a library folder
        folder = next(
            (
                folder
                for folder in library_folders
                if path.is_relative_to(Path(folder["path"]).resolve())
            ),
            None,
        )
        if not folder:
            print(f"Error: The path {path} is not part of any folder in the library.")
            return
        # Only scan the specified path
        library_folders = [{"id": folder["id"], "path": str(path)}]

    for folder in library_folders:
        folder_path = Path(folder["path"])
        if not folder_path.exists() or not folder_path.is_dir():
            tqdm.write(f"Folder does not exist or is not a directory: {folder_path}")
            continue

        added_file_count, updated_file_count, scanned_files = asyncio.run(
            loop_files(library_id, folder, folder_path, force, plugins, batch_size)
        )
        total_files_added += added_file_count
        total_files_updated += updated_file_count

        # Check for deleted files
        limit = 100
        offset = 0
        total_entities = 0  # We'll update this after the first request
        with tqdm(
            total=total_entities, desc="Checking for deleted files", leave=True
        ) as pbar2:
            while True:
                existing_files_response = httpx.get(
                    f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                    params={"limit": limit, "offset": offset},
                    timeout=60,
                )
                if existing_files_response.status_code != 200:
                    pbar2.write(
                        f"Failed to retrieve existing files: {existing_files_response.status_code} - {existing_files_response.text}"
                    )
                    break

                existing_files = existing_files_response.json()
                if not existing_files:
                    break

                # Update total if this is the first request
                if offset == 0:
                    total_entities = int(
                        existing_files_response.headers.get(
                            "X-Total-Count", total_entities
                        )
                    )
                    pbar2.total = total_entities
                    pbar2.refresh()

                for existing_file in existing_files:
                    if (
                        Path(existing_file["filepath"]).is_relative_to(folder_path)
                        and existing_file["filepath"] not in scanned_files
                    ):
                        # File has been deleted
                        delete_response = httpx.delete(
                            f"{BASE_URL}/libraries/{library_id}/entities/{existing_file['id']}"
                        )
                        if 200 <= delete_response.status_code < 300:
                            pbar2.write(
                                f"Deleted file from library: {existing_file['filepath']}"
                            )
                            total_files_deleted += 1
                        else:
                            pbar2.write(
                                f"Failed to delete file: {delete_response.status_code} - {delete_response.text}"
                            )
                    pbar2.update(1)

                offset += limit

    print(f"Total files added: {total_files_added}")
    print(f"Total files updated: {total_files_updated}")
    print(f"Total files deleted: {total_files_deleted}")

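# Example invocation (illustrative; the actual command prefix depends on how
# lib_app is mounted in the memos CLI entry point):
#
#   memos lib scan 1 --folder 2 --plugin 3 --batch-size 8
#
# A bare path argument limits the scan to one subdirectory of a library folder,
# and --force re-submits files even when their timestamps are unchanged.
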

async def add_entity(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    library_id,
    plugins,
    new_entity,
) -> Tuple[str, FileStatus, bool, httpx.Response]:
    async with semaphore:
        MAX_RETRIES = 3
        RETRY_DELAY = 2.0
        for attempt in range(MAX_RETRIES):
            try:
                post_response = await client.post(
                    f"{BASE_URL}/libraries/{library_id}/entities",
                    json=new_entity,
                    params=(
                        {"plugins": plugins, "update_index": "true"}
                        if plugins
                        else {"update_index": "true"}
                    ),
                    timeout=60,
                )
                if 200 <= post_response.status_code < 300:
                    return new_entity["filepath"], FileStatus.ADDED, True, post_response
                else:
                    return (
                        new_entity["filepath"],
                        FileStatus.ADDED,
                        False,
                        post_response,
                    )
            except Exception as e:
                logging.error(
                    f"Error while adding entity (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    return new_entity["filepath"], FileStatus.ADDED, False, None


async def update_entity(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    plugins,
    new_entity,
    existing_entity,
) -> Tuple[str, FileStatus, bool, httpx.Response]:
    MAX_RETRIES = 3
    RETRY_DELAY = 2.0
    async with semaphore:
        for attempt in range(MAX_RETRIES):
            try:
                update_response = await client.put(
                    f"{BASE_URL}/entities/{existing_entity['id']}",
                    json=new_entity,
                    params={
                        "trigger_webhooks_flag": "true",
                        "update_index": "true",
                        **({"plugins": plugins} if plugins else {}),
                    },
                    timeout=60,
                )
                if 200 <= update_response.status_code < 300:
                    return (
                        new_entity["filepath"],
                        FileStatus.UPDATED,
                        True,
                        update_response,
                    )
                else:
                    return (
                        new_entity["filepath"],
                        FileStatus.UPDATED,
                        False,
                        update_response,
                    )
            except Exception as e:
                logging.error(
                    f"Error while updating entity {existing_entity['id']} (attempt {attempt + 1}/{MAX_RETRIES}): {e}"
                )
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    return new_entity["filepath"], FileStatus.UPDATED, False, None

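# Both helpers share the same shape: requests are throttled by the shared
# semaphore, retried up to MAX_RETRIES times with a fixed RETRY_DELAY between
# attempts, and always resolve to a (filepath, FileStatus, succeeded, response)
# tuple so loop_files can report results uniformly. On the final failed attempt
# the response element is None.
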

@lib_app.command("reindex")
def reindex(
    library_id: int,
    folders: List[int] = typer.Option(None, "--folder", "-f"),
    force: bool = typer.Option(
        False, "--force", help="Force recreate FTS and vector tables before reindexing"
    ),
    batch_size: int = typer.Option(
        1, "--batch-size", "-bs", help="Batch size for processing entities"
    ),
):
    print(f"Reindexing library {library_id}")

    from memos.models import recreate_fts_and_vec_tables

    # Get the library
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Failed to get library: {response.status_code} - {response.text}")
        return

    library = response.json()
    scanned_entities = set()

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    if force:
        print("Force flag is set. Recreating FTS and vector tables...")
        recreate_fts_and_vec_tables()
        print("FTS and vector tables have been recreated.")

    with httpx.Client() as client:
        total_entities = 0

        # Get total entity count for all folders
        for folder in library_folders:
            response = client.get(
                f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                params={"limit": 1, "offset": 0},
            )
            if response.status_code == 200:
                total_entities += int(response.headers.get("X-Total-Count", 0))
            else:
                print(
                    f"Failed to get entity count for folder {folder['id']}: {response.status_code} - {response.text}"
                )

        # Now process entities with a progress bar
        with tqdm(total=total_entities, desc="Reindexing entities") as pbar:
            for folder in library_folders:
                print(f"Processing folder: {folder['id']}")

                # List all entities in the folder
                limit = 200
                offset = 0
                while True:
                    entities_response = client.get(
                        f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                        params={"limit": limit, "offset": offset},
                    )
                    if entities_response.status_code != 200:
                        print(
                            f"Failed to get entities: {entities_response.status_code} - {entities_response.text}"
                        )
                        break

                    entities = entities_response.json()
                    if not entities:
                        break

                    # Collect the entity IDs that still need processing
                    entity_ids = [
                        entity["id"]
                        for entity in entities
                        if entity["id"] not in scanned_entities
                    ]

                    # Process the IDs in batches of batch_size
                    for i in range(0, len(entity_ids), batch_size):
                        batch_ids = entity_ids[i : i + batch_size]
                        if batch_ids:
                            batch_response = client.post(
                                f"{BASE_URL}/entities/batch-index",
                                json={"entity_ids": batch_ids},
                                timeout=60,
                            )
                            if batch_response.status_code != 204:
                                print(
                                    f"Failed to update batch: {batch_response.status_code} - {batch_response.text}"
                                )
                            pbar.update(len(batch_ids))
                            scanned_entities.update(batch_ids)

                    offset += limit

    print(f"Reindexing completed for library {library_id}")

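# Reindexing only re-submits existing entities to the /entities/batch-index
# endpoint (which answers 204 on success); it does not rescan the filesystem.
# A hypothetical invocation, assuming the same CLI mounting as above:
#
#   memos lib reindex 1 --force --batch-size 32
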

@lib_app.command("sync")
def sync(
    library_id: int,
    filepath: str,
    force: bool = typer.Option(
        False, "--force", "-f", help="Force update the file even if it hasn't changed"
    ),
    without_webhooks: bool = typer.Option(
        False, "--no-plugins", help="Disable plugin triggers", is_flag=True
    ),
):
    """
    Sync a specific file with the library.
    """
    # 1. Get library by id and check if it exists
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        typer.echo(f"Error: Library with id {library_id} not found.")
        raise typer.Exit(code=1)

    library = response.json()

    # Convert filepath to absolute path
    file_path = Path(filepath).resolve()

    if not file_path.is_file():
        typer.echo(f"Error: File {file_path} does not exist.")
        raise typer.Exit(code=1)

    # 2. Check if the file exists in the library
    response = httpx.get(
        f"{BASE_URL}/libraries/{library_id}/entities/by-filepath",
        params={"filepath": str(file_path)},
    )

    file_stat = file_path.stat()
    file_type, file_type_group = get_file_type(file_path)

    new_entity = {
        "filename": file_path.name,
        "filepath": str(file_path),
        "size": file_stat.st_size,
        "file_created_at": format_timestamp(file_stat.st_ctime),
        "file_last_modified_at": format_timestamp(file_stat.st_mtime),
        "file_type": file_type,
        "file_type_group": file_type_group,
    }

    # Handle metadata
    is_thumbnail = False
    if file_type_group == "image":
        metadata = get_image_metadata(file_path)
        if metadata:
            if "active_window" in metadata and "active_app" not in metadata:
                metadata["active_app"] = metadata["active_window"].split(" - ")[0]
            new_entity["metadata_entries"] = [
                {
                    "key": key,
                    "value": str(value),
                    "source": MetadataSource.SYSTEM_GENERATED.value,
                    "data_type": (
                        "number" if isinstance(value, (int, float)) else "text"
                    ),
                }
                for key, value in metadata.items()
                if key != IS_THUMBNAIL
            ]
            if "active_app" in metadata:
                new_entity.setdefault("tags", []).append(metadata["active_app"])
            is_thumbnail = metadata.get(IS_THUMBNAIL, False)

    if response.status_code == 200:
        # File exists, update it
        existing_entity = response.json()
        new_entity["folder_id"] = existing_entity["folder_id"]

        if is_thumbnail:
            new_entity["file_created_at"] = existing_entity["file_created_at"]
            new_entity["file_last_modified_at"] = existing_entity[
                "file_last_modified_at"
            ]
            new_entity["file_type"] = existing_entity["file_type"]
            new_entity["file_type_group"] = existing_entity["file_type_group"]
            new_entity["size"] = existing_entity["size"]

        # Merge existing metadata with new metadata
        if new_entity.get("metadata_entries"):
            new_metadata_keys = {
                entry["key"] for entry in new_entity["metadata_entries"]
            }
            for existing_entry in existing_entity["metadata_entries"]:
                if existing_entry["key"] not in new_metadata_keys:
                    new_entity["metadata_entries"].append(existing_entry)

        if force or (
            existing_entity["file_last_modified_at"]
            != new_entity["file_last_modified_at"]
            or existing_entity["size"] != new_entity["size"]
        ):
            update_response = httpx.put(
                f"{BASE_URL}/entities/{existing_entity['id']}",
                json=new_entity,
                params={
                    "trigger_webhooks_flag": str(not without_webhooks).lower(),
                    "update_index": "true",
                },
                timeout=60,
            )
            if update_response.status_code == 200:
                typer.echo(f"Updated file: {file_path}")
            else:
                typer.echo(
                    f"Error updating file: {update_response.status_code} - {update_response.text}"
                )
        else:
            typer.echo(f"File {file_path} is up to date. No changes made.")

    else:
        # 3. File doesn't exist, check if it belongs to a folder in the library
        folder = next(
            (
                folder
                for folder in library["folders"]
                if str(file_path).startswith(folder["path"])
            ),
            None,
        )

        if folder:
            # Create new entity
            new_entity["folder_id"] = folder["id"]

            create_response = httpx.post(
                f"{BASE_URL}/libraries/{library_id}/entities",
                json=new_entity,
                params={
                    "trigger_webhooks_flag": str(not without_webhooks).lower(),
                    "update_index": "true",
                },
                timeout=60,
            )

            if create_response.status_code == 200:
                typer.echo(f"Created new entity for file: {file_path}")
            else:
                typer.echo(
                    f"Error creating entity: {create_response.status_code} - {create_response.text}"
                )

        else:
            # 4. File doesn't belong to any folder in the library
            typer.echo(
                f"Error: File {file_path} does not belong to any folder in the library."
            )
            raise typer.Exit(code=1)

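# Besides being a CLI command, sync() is also called directly by
# LibraryFileHandler.process_file below, which is how the watch command pushes
# individual file changes to the server as they happen.
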

@lru_cache(maxsize=1)
def is_on_battery():
    try:
        battery = psutil.sensors_battery()
        return battery is not None and not battery.power_plugged
    except Exception:
        return False  # If unable to detect battery status, assume not on battery

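# lru_cache(maxsize=1) memoizes the battery reading so the watch loop does not
# hit psutil on every call; LibraryFileHandler.update_sparsity_window calls
# is_on_battery.cache_clear() at most once per battery_check_interval (60 s)
# to refresh the cached value.
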

class LibraryFileHandler(FileSystemEventHandler):
    def __init__(
        self,
        library_id,
        include_files,
        max_workers=2,
        sparsity_factor=3,
        window_size=10,
    ):
        self.library_id = library_id
        self.include_files = include_files
        self.inode_pattern = re.compile(r"\._.+")
        self.pending_files = defaultdict(lambda: {"timestamp": 0, "last_size": 0})
        self.buffer_time = 2
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.lock = threading.Lock()

        self.sparsity_window = 12
        self.sparsity_factor = sparsity_factor
        self.window_size = window_size

        self.pending_times = deque(maxlen=window_size)
        self.sync_times = deque(maxlen=window_size)

        self.file_count = 0
        self.file_submitted = 0
        self.file_synced = 0
        self.file_skipped = 0
        self.logger = logger

        self.base_sparsity_window = self.sparsity_window
        self.last_battery_check = 0
        self.battery_check_interval = 60  # Check battery status every 60 seconds

    def handle_event(self, event):
        if not event.is_directory and self.is_valid_file(event.src_path):
            current_time = time.time()
            with self.lock:
                file_info = self.pending_files[event.src_path]

                if current_time - file_info["timestamp"] > self.buffer_time:
                    file_info["timestamp"] = current_time
                    self.pending_times.append(current_time)

                file_info["last_size"] = os.path.getsize(event.src_path)

            return True
        return False

    def process_pending_files(self):
        current_time = time.time()
        files_to_process_with_plugins = []
        files_to_process_without_plugins = []
        processed_in_current_loop = 0
        with self.lock:
            for path, file_info in list(self.pending_files.items()):
                if current_time - file_info["timestamp"] > self.buffer_time:
                    processed_in_current_loop += 1
                    if os.path.exists(path) and os.path.getsize(path) > 0:
                        self.file_count += 1
                        if self.file_count % self.sparsity_window == 0:
                            files_to_process_with_plugins.append(path)
                            print(
                                f"file_count % sparsity_window: {self.file_count} % {self.sparsity_window} == 0"
                            )
                            print(f"Picked file for processing with plugins: {path}")
                        else:
                            files_to_process_without_plugins.append(path)
                            self.file_skipped += 1
                    del self.pending_files[path]
                elif not os.path.exists(path):
                    del self.pending_files[path]

        # Process files with plugins - these count as submitted
        for path in files_to_process_with_plugins:
            self.executor.submit(self.process_file, path, False)
            self.file_submitted += 1

        # Process files without plugins - these don't count as submitted
        for path in files_to_process_without_plugins:
            self.executor.submit(self.process_file, path, True)

        if processed_in_current_loop > 0:
            self.logger.info(
                f"File count: {self.file_count}, Files submitted: {self.file_submitted}, Files synced: {self.file_synced}, Files skipped: {self.file_skipped}"
            )

        self.update_sparsity_window()

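    # Events are debounced: a path only leaves pending_files once it has been
    # quiet for buffer_time seconds, and only roughly one in every
    # sparsity_window files is submitted with plugins enabled; the rest are
    # synced without webhooks and counted as skipped.
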
    def process_file(self, path, no_plugins):
        self.logger.debug(f"Processing file: {path} (with plugins: {not no_plugins})")
        start_time = time.time()
        sync(self.library_id, path, without_webhooks=no_plugins)
        end_time = time.time()
        if not no_plugins:
            with self.lock:
                self.sync_times.append(end_time - start_time)
                self.file_synced += 1

    def update_sparsity_window(self):
        min_samples = max(3, self.window_size // 3)
        max_interval = 60  # Maximum allowed interval between events in seconds

        if (
            len(self.pending_times) >= min_samples
            and len(self.sync_times) >= min_samples
        ):
            # Filter out large time gaps
            filtered_intervals = [
                self.pending_times[i] - self.pending_times[i - 1]
                for i in range(1, len(self.pending_times))
                if self.pending_times[i] - self.pending_times[i - 1] <= max_interval
            ]

            if filtered_intervals:
                avg_interval = sum(filtered_intervals) / len(filtered_intervals)
                pending_files_per_second = 1 / avg_interval if avg_interval > 0 else 0
            else:
                pending_files_per_second = 0

            sync_time_total = sum(self.sync_times)
            sync_files_per_second = (
                len(self.sync_times) / sync_time_total if sync_time_total > 0 else 0
            )

            if pending_files_per_second > 0 and sync_files_per_second > 0:
                rate = pending_files_per_second / sync_files_per_second
                new_sparsity_window = max(1, math.ceil(self.sparsity_factor * rate))

                current_time = time.time()
                if current_time - self.last_battery_check > self.battery_check_interval:
                    self.last_battery_check = current_time
                    is_on_battery.cache_clear()  # Clear the cache to get fresh battery status
                new_sparsity_window = (
                    new_sparsity_window * 2 if is_on_battery() else new_sparsity_window
                )

                if new_sparsity_window != self.sparsity_window:
                    old_sparsity_window = self.sparsity_window
                    self.sparsity_window = new_sparsity_window
                    self.logger.info(
                        f"Updated sparsity window: {old_sparsity_window} -> {self.sparsity_window}"
                    )
                    self.logger.debug(
                        f"Pending files per second: {pending_files_per_second:.2f}"
                    )
                    self.logger.debug(
                        f"Sync files per second: {sync_files_per_second:.2f}"
                    )
                    self.logger.debug(f"Rate (pending/sync): {rate:.2f}")

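    # Worked example: if debounced events arrive at 2 files/s and plugin syncs
    # complete at 0.5 files/s, rate = 2 / 0.5 = 4, so with sparsity_factor = 3
    # the window becomes ceil(3 * 4) = 12, i.e. roughly one file in twelve is
    # sent through plugins (doubled again to 24 while on battery).
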
    def is_valid_file(self, path):
        filename = os.path.basename(path)
        return (
            any(path.lower().endswith(ext) for ext in self.include_files)
            and not is_temp_file(filename)
            and not self.inode_pattern.match(filename)
        )

    def on_created(self, event):
        self.handle_event(event)

    def on_modified(self, event):
        self.handle_event(event)

    def on_moved(self, event):
        if self.handle_event(event):
            # For moved events, we need to update the key in pending_files
            with self.lock:
                self.pending_files[event.dest_path] = self.pending_files.pop(
                    event.src_path, {"timestamp": time.time(), "last_size": 0}
                )

    def on_deleted(self, event):
        if self.is_valid_file(event.src_path):
            self.logger.info(f"File deleted: {event.src_path}")
            # Remove from pending files if it was there
            with self.lock:
                self.pending_files.pop(event.src_path, None)
            # Add logic for handling deleted files if needed


@lib_app.command("watch")
def watch(
    library_id: int,
    folders: List[int] = typer.Option(
        None, "--folder", "-f", help="Specify folders to watch"
    ),
    sparsity_factor: float = typer.Option(
        3.0, "--sparsity-factor", "-sf", help="Sparsity factor for file processing"
    ),
    window_size: int = typer.Option(
        10, "--window-size", "-ws", help="Window size for rate calculation"
    ),
    verbose: bool = typer.Option(
        False, "--verbose", "-v", help="Enable verbose logging"
    ),
):
    """
    Watch for file changes in the library folders and sync automatically.
    """
    # Set the logging level based on the verbose flag
    log_level = "DEBUG" if verbose else "INFO"
    logger.setLevel(log_level)

    logger.info(f"Watching library {library_id} for changes...")

    # Get the library
    response = httpx.get(f"{BASE_URL}/libraries/{library_id}")
    if response.status_code != 200:
        print(f"Error: Library with id {library_id} not found.")
        raise typer.Exit(code=1)

    library = response.json()

    # Filter folders if the folders parameter is provided
    if folders:
        library_folders = [
            folder for folder in library["folders"] if folder["id"] in folders
        ]
    else:
        library_folders = library["folders"]

    if not library_folders:
        print("No folders to watch.")
        return

    # Create an observer and handler for each folder in the library
    observer = Observer()
    handlers = []
    for folder in library_folders:
        folder_path = Path(folder["path"])
        event_handler = LibraryFileHandler(
            library_id,
            include_files,
            sparsity_factor=sparsity_factor,
            window_size=window_size,
        )
        handlers.append(event_handler)
        observer.schedule(event_handler, str(folder_path), recursive=True)
        print(f"Watching folder: {folder_path}")

    observer.start()
    try:
        while True:
            time.sleep(5)
            for handler in handlers:
                handler.process_pending_files()
    except KeyboardInterrupt:
        observer.stop()
        for handler in handlers:
            handler.executor.shutdown(wait=True)
    observer.join()
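
# Illustrative usage (the command prefix again depends on the CLI entry point):
#
#   memos lib watch 1 --folder 2 --sparsity-factor 3 --window-size 10 --verbose
#
# The watcher polls every 5 seconds, flushing debounced events from each
# handler, and shuts the thread pools down cleanly on Ctrl+C.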