From 2ffd4b0b78fe0020751c8574c426a6d64c317715 Mon Sep 17 00:00:00 2001 From: arkohut <39525455+arkohut@users.noreply.github.com> Date: Wed, 12 Jun 2024 20:26:24 +0800 Subject: [PATCH] refactor: use magika for file type detect --- memos/commands.py | 15 ++++++++---- ...adata_entry_to_entity_sucess_response.json | 2 +- .../patch_entity_metadata_response.json | 3 ++- memos/models.py | 1 + memos/schemas.py | 3 +++ memos/test_server.py | 23 +++++++++++++------ requirements.txt | 1 + setup.py | 3 ++- 8 files changed, 36 insertions(+), 15 deletions(-) diff --git a/memos/commands.py b/memos/commands.py index 6155f51..652cac8 100644 --- a/memos/commands.py +++ b/memos/commands.py @@ -1,4 +1,3 @@ -import mimetypes import os import time from datetime import datetime, timezone @@ -10,7 +9,7 @@ import typer from memos.server import run_server from tabulate import tabulate from tqdm import tqdm -from gitignore_parser import parse_gitignore +from magika import Magika app = typer.Typer() lib_app = typer.Typer() @@ -18,6 +17,8 @@ plugin_app = typer.Typer() app.add_typer(plugin_app, name="plugin") app.add_typer(lib_app, name="lib") +file_detector = Magika() + BASE_URL = "http://localhost:8080" ignore_files = [".DS_Store"] @@ -32,6 +33,11 @@ def format_timestamp(timestamp): ) +def get_file_type(file_path): + file_result = file_detector.identify_path(file_path) + return file_result.output.ct_label, file_result.output.group + + def display_libraries(libraries): table = [] for library in libraries: @@ -130,9 +136,7 @@ def scan(library_id: int): str(absolute_file_path) ) # Add to scanned files set file_stat = file_path.stat() - file_type = ( - mimetypes.guess_type(file_path)[0] or "application/octet-stream" - ) + file_type, file_type_group = get_file_type(absolute_file_path) new_entity = { "filename": file_path.name, "filepath": str(absolute_file_path), # Save absolute path @@ -140,6 +144,7 @@ def scan(library_id: int): "file_created_at": format_timestamp(file_stat.st_ctime), "file_last_modified_at": format_timestamp(file_stat.st_mtime), "file_type": file_type, + "file_type_group": file_type_group, "folder_id": folder["id"], } # Check if the entity already exists diff --git a/memos/fixtures/add_metadata_entry_to_entity_sucess_response.json b/memos/fixtures/add_metadata_entry_to_entity_sucess_response.json index 6bfeab0..b9b76fb 100644 --- a/memos/fixtures/add_metadata_entry_to_entity_sucess_response.json +++ b/memos/fixtures/add_metadata_entry_to_entity_sucess_response.json @@ -1,5 +1,5 @@ { - "data_type": "attribute", + "data_type": "text", "entity_id": 1, "id": 1, "key": "author", diff --git a/memos/fixtures/patch_entity_metadata_response.json b/memos/fixtures/patch_entity_metadata_response.json index 7b55cda..908a32e 100644 --- a/memos/fixtures/patch_entity_metadata_response.json +++ b/memos/fixtures/patch_entity_metadata_response.json @@ -1,7 +1,8 @@ { "file_created_at": "2023-01-01T00:00:00", "file_last_modified_at": "2023-01-01T00:00:00", - "file_type": "text/plain", + "file_type": "txt", + "file_type_group": "text", "filename": "metadata_test_file.txt", "filepath": "/tmp/metadata_folder/metadata_test_file.txt", "folder_id": 1, diff --git a/memos/models.py b/memos/models.py index c106f0e..32b53ae 100644 --- a/memos/models.py +++ b/memos/models.py @@ -58,6 +58,7 @@ class EntityModel(Base): file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) file_type: Mapped[str] = mapped_column(String, nullable=False) + file_type_group: Mapped[str] = mapped_column(String, nullable=False) last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) library_id: Mapped[int] = mapped_column( Integer, ForeignKey("libraries.id"), nullable=False diff --git a/memos/schemas.py b/memos/schemas.py index 2bc13ff..bd5e8c0 100644 --- a/memos/schemas.py +++ b/memos/schemas.py @@ -31,6 +31,7 @@ class NewEntityParam(BaseModel): file_created_at: datetime file_last_modified_at: datetime file_type: str + file_type_group: str folder_id: int @@ -46,6 +47,7 @@ class UpdateEntityParam(BaseModel): file_created_at: datetime | None = None file_last_modified_at: datetime | None = None file_type: str | None = None + file_type_group: str | None = None tags: List[str] = [] attrs: List[EntityMetadataParam] = [] @@ -128,6 +130,7 @@ class Entity(BaseModel): file_created_at: datetime file_last_modified_at: datetime file_type: str + file_type_group: str last_scan_at: datetime | None folder_id: int library_id: int diff --git a/memos/test_server.py b/memos/test_server.py index a1fc276..f010f71 100644 --- a/memos/test_server.py +++ b/memos/test_server.py @@ -60,7 +60,8 @@ def setup_library_with_entity(client): size=5678, file_created_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00", - file_type="text/plain", + file_type="txt", + file_type_group="text", folder_id=folder_id, ) entity_response = client.post( @@ -161,7 +162,8 @@ def test_new_entity(client): size=150, file_created_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00", - file_type="text/plain", + file_type="txt", + file_type_group="text", folder_id=folder_id, ) entity_response = client.post( @@ -178,7 +180,8 @@ def test_new_entity(client): assert entity_data["size"] == 150 assert entity_data["file_created_at"] == "2023-01-01T00:00:00" assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00" - assert entity_data["file_type"] == "text/plain" + assert entity_data["file_type"] == "txt" + assert entity_data["file_type_group"] == "text" assert entity_data["folder_id"] == 1 # Test for library not found @@ -196,7 +199,8 @@ def test_update_entity(client): updated_entity = UpdateEntityParam( size=200, file_created_at="2023-01-02T00:00:00", - file_type="text/markdown", + file_type="markdown", + file_type_group="text", ) update_response = client.put( f"/entities/{entity_id}", @@ -212,7 +216,8 @@ def test_update_entity(client): assert updated_data["size"] == 200 assert updated_data["file_created_at"] == "2023-01-02T00:00:00" assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00" - assert updated_data["file_type"] == "text/markdown" + assert updated_data["file_type"] == "markdown" + assert updated_data["file_type_group"] == "text" # Test for entity not found invalid_update_response = client.put( @@ -240,7 +245,8 @@ def test_get_entity_by_filepath(client): size=100, file_created_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00", - file_type="text/plain", + file_type="txt", + file_type_group="text", folder_id=1, ) entity_response = client.post( @@ -263,6 +269,7 @@ def test_get_entity_by_filepath(client): assert entity_data["filename"] == new_entity.filename assert entity_data["size"] == new_entity.size assert entity_data["file_type"] == new_entity.file_type + assert entity_data["file_type_group"] == new_entity.file_type_group # Test for entity not found invalid_get_response = client.get( @@ -302,7 +309,8 @@ def test_list_entities_in_folder(client): size=100, file_created_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00", - file_type="text/plain", + file_type="txt", + file_type_group="text", folder_id=folder_id, ) entity_response = client.post( @@ -324,6 +332,7 @@ def test_list_entities_in_folder(client): assert entities_data[0]["filename"] == new_entity.filename assert entities_data[0]["size"] == new_entity.size assert entities_data[0]["file_type"] == new_entity.file_type + assert entities_data[0]["file_type_group"] == new_entity.file_type_group # Test for folder not found invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities") diff --git a/requirements.txt b/requirements.txt index 52c3046..0e77033 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ pydantic sqlalchemy typer tabulate +magika diff --git a/setup.py b/setup.py index 6829e65..4eaa08f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ setup( 'httpx', 'pydantic', 'sqlalchemy', - 'typer' + 'typer', + 'magika' ], entry_points={ 'console_scripts': [