feat(indexing): do not repeatedly index up-to-date entities (compare last_scan_at)

arkohut 2024-08-29 13:57:46 +08:00
parent 439adfa955
commit 9db92908a6


@@ -15,6 +15,7 @@ from tabulate import tabulate
 from tqdm import tqdm
 from enum import Enum
 from magika import Magika
+from .config import settings
 
 IS_THUMBNAIL = "is_thumbnail"
@@ -28,7 +29,7 @@ app.add_typer(lib_app, name="lib")
 file_detector = Magika()
 
-BASE_URL = "http://localhost:8080"
+BASE_URL = f"http://localhost:{settings.server_port}"
 
 ignore_files = [".DS_Store", ".screen_sequences", "worklog"]
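The .config module is not part of this diff. As a rough sketch only, a compatible settings object could look like the following, assuming pydantic-settings is available; the SERVER_PORT override is an assumption, not something this commit shows:

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # Default matches the old hard-coded port; pydantic-settings matches
    # field names to environment variables case-insensitively, so this
    # could be overridden via SERVER_PORT (hypothetical usage).
    server_port: int = 8080


settings = Settings()

With something like this in place, f"http://localhost:{settings.server_port}" resolves to the old URL by default but follows the configured port everywhere.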
@@ -136,7 +137,7 @@ async def loop_files(library_id, folder, folder_path, force, plugins):
     updated_file_count = 0
     added_file_count = 0
     scanned_files = set()
-    semaphore = asyncio.Semaphore(8)
+    semaphore = asyncio.Semaphore(4)
     async with httpx.AsyncClient(timeout=60) as client:
         tasks = []
         for root, _, files in os.walk(folder_path):
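Dropping the semaphore from 8 to 4 halves how many files are processed concurrently. For reference, a minimal sketch of the throttling pattern this line relies on; the helper names and the sleep stand-in are illustrative, not the project's code:

import asyncio


async def process_one(semaphore: asyncio.Semaphore, path: str) -> str:
    # At most 4 coroutines execute this body at once; the rest
    # wait at the `async with` until a slot frees up.
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP request
        return path


async def main() -> None:
    semaphore = asyncio.Semaphore(4)
    tasks = [process_one(semaphore, f"file_{i}.png") for i in range(20)]
    done = await asyncio.gather(*tasks)
    print(f"scanned {len(done)} files")


asyncio.run(main())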
@@ -507,6 +508,34 @@ async def update_entity(
     return new_entity["filepath"], FileStatus.UPDATED, False, None
 
 
+async def check_and_index_entity(client, entity_id, entity_last_scan_at):
+    try:
+        index_response = await client.get(f"{BASE_URL}/entities/{entity_id}/index")
+        if index_response.status_code == 200:
+            index_data = index_response.json()
+            if index_data["last_scan_at"] is None:
+                return entity_last_scan_at is not None
+            index_last_scan_at = datetime.fromtimestamp(index_data["last_scan_at"])
+            entity_last_scan_at = datetime.fromisoformat(entity_last_scan_at)
+            if index_last_scan_at >= entity_last_scan_at:
+                return False  # Index is up to date, no need to update
+        return True  # Index doesn't exist or needs update
+    except httpx.HTTPStatusError as e:
+        if e.response.status_code == 404:
+            return True  # Index doesn't exist, need to create
+        raise  # Re-raise other HTTP errors
+
+
+async def index_batch(client, entity_ids):
+    index_response = await client.post(
+        f"{BASE_URL}/entities/batch-index",
+        json=entity_ids,
+        timeout=60,
+    )
+    return index_response
+
+
 @lib_app.command("index")
 def index(
     library_id: int,
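A subtle point in check_and_index_entity above: the index endpoint reports last_scan_at as a Unix timestamp, while the entity record carries an ISO-8601 string, so both are normalized to datetime before comparing. A worked example of that comparison with illustrative values (a sketch, not the project's code):

from datetime import datetime

# Illustrative values, not taken from a real response.
index_last_scan_at = datetime.fromtimestamp(1724911066)  # index side: Unix timestamp
entity_last_scan_at = datetime.fromisoformat("2024-08-29 14:30:00")  # entity side: ISO 8601

# The same decision check_and_index_entity makes: re-index only when the
# entity was scanned after the index was last built. Note fromtimestamp()
# yields local time and this fromisoformat() call yields a naive datetime,
# so the comparison implicitly assumes both values share a timezone.
needs_indexing = index_last_scan_at < entity_last_scan_at
print(needs_indexing)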
@@ -531,6 +560,8 @@ def index(
     else:
         library_folders = library["folders"]
 
+    async def process_folders():
+        async with httpx.AsyncClient(timeout=60) as client:
             # Iterate through folders
             for folder in library_folders:
                 tqdm.write(f"Processing folder: {folder['id']}")
@@ -539,12 +570,13 @@ def index(
                 limit = 200
                 offset = 0
                 total_entities = 0  # We'll update this after the first request
-                with tqdm(total=total_entities, desc="Indexing entities", leave=True) as pbar:
+                with tqdm(
+                    total=total_entities, desc="Indexing entities", leave=True
+                ) as pbar:
                     while True:
-                        entities_response = httpx.get(
+                        entities_response = await client.get(
                             f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/entities",
                             params={"limit": limit, "offset": offset},
-                            timeout=60,
                         )
                         if entities_response.status_code != 200:
                             pbar.write(
@@ -559,40 +591,51 @@ def index(
                         # Update total if this is the first request
                         if offset == 0:
                             total_entities = int(
-                                entities_response.headers.get("X-Total-Count", total_entities)
+                                entities_response.headers.get(
+                                    "X-Total-Count", total_entities
+                                )
                             )
                             pbar.total = total_entities
                             pbar.refresh()
 
                         # Index each entity
-                        batch_size = 40
+                        batch_size = 8
                         for i in range(0, len(entities), batch_size):
                             batch = entities[i : i + batch_size]
-                            entity_ids = [entity["id"] for entity in batch]
-                            index_response = httpx.post(
-                                f"{BASE_URL}/entities/batch-index",
-                                json=entity_ids,
-                                timeout=60,
-                            )
-                            if index_response.status_code == 204:
-                                pbar.write(f"Indexed batch of {len(batch)} entities")
-                            else:
-                                pbar.write(
-                                    f"Failed to index batch: {index_response.status_code} - {index_response.text}"
-                                )
-                            scanned_entities.update(str(entity["id"]) for entity in batch)
+                            to_index = []
+
+                            for entity in batch:
+                                needs_indexing = await check_and_index_entity(
+                                    client, entity["id"], entity["last_scan_at"]
+                                )
+                                if needs_indexing:
+                                    to_index.append(entity["id"])
+
+                            if to_index:
+                                index_response = await index_batch(client, to_index)
+                                if index_response.status_code == 204:
+                                    pbar.write(
+                                        f"Indexed batch of {len(to_index)} entities"
+                                    )
+                                else:
+                                    pbar.write(
+                                        f"Failed to index batch: {index_response.status_code} - {index_response.text}"
+                                    )
+
+                            scanned_entities.update(
+                                str(entity["id"]) for entity in batch
+                            )
                             pbar.update(len(batch))
 
                         offset += limit
 
                 # List all indexed entities in the folder
                 offset = 0
+                print(f"Starting cleanup process for folder {folder['id']}")
                 while True:
-                    index_response = httpx.get(
+                    index_response = await client.get(
                         f"{BASE_URL}/libraries/{library_id}/folders/{folder['id']}/index",
                         params={"limit": 200, "offset": offset},
-                        timeout=60,
                     )
                     if index_response.status_code != 200:
                         tqdm.write(
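A pattern running through this hunk: one-shot httpx.get/httpx.post calls become awaited methods on the shared AsyncClient, which pools connections and carries the timeout itself, which is why the per-call timeout=60 arguments disappear. A minimal before/after sketch of that difference, with an illustrative URL:

import asyncio

import httpx

URL = "http://localhost:8080/health"  # illustrative endpoint, assumes a local server


def before() -> None:
    # One-shot module-level helpers: each call opens and tears down
    # its own connection, and blocks the thread while waiting.
    for _ in range(3):
        httpx.get(URL, timeout=60)


async def after() -> None:
    # One shared client: connections are pooled across calls, and the
    # client-level timeout makes per-call timeout arguments redundant.
    async with httpx.AsyncClient(timeout=60) as client:
        for _ in range(3):
            await client.get(URL)


if __name__ == "__main__":
    asyncio.run(after())  # requires something listening at URL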
@@ -602,6 +645,7 @@ def index(
                     indexed_entities = index_response.json()
                     if not indexed_entities:
+                        print("No more indexed entities to process")
                         break
 
                     # Delete indexes for entities not in scanned_entities
@@ -609,11 +653,16 @@ def index(
                         indexed_entities, desc="Cleaning up indexes", leave=False
                     ):
                         if indexed_entity["id"] not in scanned_entities:
-                            delete_response = httpx.delete(
+                            tqdm.write(
+                                f"Entity {indexed_entity['id']} not in scanned entities, deleting index"
+                            )
+                            delete_response = await client.delete(
                                 f"{BASE_URL}/entities/{indexed_entity['id']}/index"
                             )
                             if delete_response.status_code == 204:
-                                tqdm.write(f"Deleted index for entity: {indexed_entity['id']}")
+                                tqdm.write(
+                                    f"Deleted index for entity: {indexed_entity['id']}"
+                                )
                             else:
                                 tqdm.write(
                                     f"Failed to delete index for entity {indexed_entity['id']}: {delete_response.status_code} - {delete_response.text}"
@@ -621,6 +670,9 @@ def index(
                     offset += 200
 
+                print(f"Finished cleanup process for folder {folder['id']}")
+
+    asyncio.run(process_folders())
     print("Indexing completed")
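Typer does not await coroutine commands, so the index command stays synchronous and drives the new async body with a single asyncio.run call. A minimal sketch of that bridge, with illustrative names:

import asyncio

import typer

app = typer.Typer()


@app.command("index")
def index(library_id: int) -> None:
    # The command itself stays sync for Typer; all awaiting happens
    # inside the local coroutine, run to completion below.
    async def process_folders() -> None:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP calls
        print(f"indexed library {library_id}")

    asyncio.run(process_folders())


if __name__ == "__main__":
    app()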