refactor: use magika for file type detection

This commit is contained in:
arkohut 2024-06-12 20:26:24 +08:00
parent 1dc7341b26
commit 2ffd4b0b78
8 changed files with 36 additions and 15 deletions

View File

@ -1,4 +1,3 @@
import mimetypes
import os import os
import time import time
from datetime import datetime, timezone from datetime import datetime, timezone
@ -10,7 +9,7 @@ import typer
from memos.server import run_server from memos.server import run_server
from tabulate import tabulate from tabulate import tabulate
from tqdm import tqdm from tqdm import tqdm
from gitignore_parser import parse_gitignore from magika import Magika
app = typer.Typer() app = typer.Typer()
lib_app = typer.Typer() lib_app = typer.Typer()
@ -18,6 +17,8 @@ plugin_app = typer.Typer()
app.add_typer(plugin_app, name="plugin") app.add_typer(plugin_app, name="plugin")
app.add_typer(lib_app, name="lib") app.add_typer(lib_app, name="lib")
file_detector = Magika()
BASE_URL = "http://localhost:8080" BASE_URL = "http://localhost:8080"
ignore_files = [".DS_Store"] ignore_files = [".DS_Store"]
@ -32,6 +33,11 @@ def format_timestamp(timestamp):
) )
def get_file_type(file_path):
    """Identify the content type of the file at *file_path*.

    Delegates to the module-level Magika detector (``file_detector``) and
    reads the result's ``output`` record.

    Returns:
        A ``(ct_label, group)`` tuple taken from the detector output.
        NOTE(review): the test fixtures in this change use values such as
        ``("txt", "text")`` for a plain-text file; confirm the full label
        set against the Magika documentation.
    """
    detection = file_detector.identify_path(file_path).output
    return detection.ct_label, detection.group
def display_libraries(libraries): def display_libraries(libraries):
table = [] table = []
for library in libraries: for library in libraries:
@ -130,9 +136,7 @@ def scan(library_id: int):
str(absolute_file_path) str(absolute_file_path)
) # Add to scanned files set ) # Add to scanned files set
file_stat = file_path.stat() file_stat = file_path.stat()
file_type = ( file_type, file_type_group = get_file_type(absolute_file_path)
mimetypes.guess_type(file_path)[0] or "application/octet-stream"
)
new_entity = { new_entity = {
"filename": file_path.name, "filename": file_path.name,
"filepath": str(absolute_file_path), # Save absolute path "filepath": str(absolute_file_path), # Save absolute path
@ -140,6 +144,7 @@ def scan(library_id: int):
"file_created_at": format_timestamp(file_stat.st_ctime), "file_created_at": format_timestamp(file_stat.st_ctime),
"file_last_modified_at": format_timestamp(file_stat.st_mtime), "file_last_modified_at": format_timestamp(file_stat.st_mtime),
"file_type": file_type, "file_type": file_type,
"file_type_group": file_type_group,
"folder_id": folder["id"], "folder_id": folder["id"],
} }
# Check if the entity already exists # Check if the entity already exists

View File

@ -1,5 +1,5 @@
{ {
"data_type": "attribute", "data_type": "text",
"entity_id": 1, "entity_id": 1,
"id": 1, "id": 1,
"key": "author", "key": "author",

View File

@ -1,7 +1,8 @@
{ {
"file_created_at": "2023-01-01T00:00:00", "file_created_at": "2023-01-01T00:00:00",
"file_last_modified_at": "2023-01-01T00:00:00", "file_last_modified_at": "2023-01-01T00:00:00",
"file_type": "text/plain", "file_type": "txt",
"file_type_group": "text",
"filename": "metadata_test_file.txt", "filename": "metadata_test_file.txt",
"filepath": "/tmp/metadata_folder/metadata_test_file.txt", "filepath": "/tmp/metadata_folder/metadata_test_file.txt",
"folder_id": 1, "folder_id": 1,

View File

@ -58,6 +58,7 @@ class EntityModel(Base):
file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
file_type: Mapped[str] = mapped_column(String, nullable=False) file_type: Mapped[str] = mapped_column(String, nullable=False)
file_type_group: Mapped[str] = mapped_column(String, nullable=False)
last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
library_id: Mapped[int] = mapped_column( library_id: Mapped[int] = mapped_column(
Integer, ForeignKey("libraries.id"), nullable=False Integer, ForeignKey("libraries.id"), nullable=False

View File

@ -31,6 +31,7 @@ class NewEntityParam(BaseModel):
file_created_at: datetime file_created_at: datetime
file_last_modified_at: datetime file_last_modified_at: datetime
file_type: str file_type: str
file_type_group: str
folder_id: int folder_id: int
@ -46,6 +47,7 @@ class UpdateEntityParam(BaseModel):
file_created_at: datetime | None = None file_created_at: datetime | None = None
file_last_modified_at: datetime | None = None file_last_modified_at: datetime | None = None
file_type: str | None = None file_type: str | None = None
file_type_group: str | None = None
tags: List[str] = [] tags: List[str] = []
attrs: List[EntityMetadataParam] = [] attrs: List[EntityMetadataParam] = []
@ -128,6 +130,7 @@ class Entity(BaseModel):
file_created_at: datetime file_created_at: datetime
file_last_modified_at: datetime file_last_modified_at: datetime
file_type: str file_type: str
file_type_group: str
last_scan_at: datetime | None last_scan_at: datetime | None
folder_id: int folder_id: int
library_id: int library_id: int

View File

@ -60,7 +60,8 @@ def setup_library_with_entity(client):
size=5678, size=5678,
file_created_at="2023-01-01T00:00:00", file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain", file_type="txt",
file_type_group="text",
folder_id=folder_id, folder_id=folder_id,
) )
entity_response = client.post( entity_response = client.post(
@ -161,7 +162,8 @@ def test_new_entity(client):
size=150, size=150,
file_created_at="2023-01-01T00:00:00", file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain", file_type="txt",
file_type_group="text",
folder_id=folder_id, folder_id=folder_id,
) )
entity_response = client.post( entity_response = client.post(
@ -178,7 +180,8 @@ def test_new_entity(client):
assert entity_data["size"] == 150 assert entity_data["size"] == 150
assert entity_data["file_created_at"] == "2023-01-01T00:00:00" assert entity_data["file_created_at"] == "2023-01-01T00:00:00"
assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00" assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00"
assert entity_data["file_type"] == "text/plain" assert entity_data["file_type"] == "txt"
assert entity_data["file_type_group"] == "text"
assert entity_data["folder_id"] == 1 assert entity_data["folder_id"] == 1
# Test for library not found # Test for library not found
@ -196,7 +199,8 @@ def test_update_entity(client):
updated_entity = UpdateEntityParam( updated_entity = UpdateEntityParam(
size=200, size=200,
file_created_at="2023-01-02T00:00:00", file_created_at="2023-01-02T00:00:00",
file_type="text/markdown", file_type="markdown",
file_type_group="text",
) )
update_response = client.put( update_response = client.put(
f"/entities/{entity_id}", f"/entities/{entity_id}",
@ -212,7 +216,8 @@ def test_update_entity(client):
assert updated_data["size"] == 200 assert updated_data["size"] == 200
assert updated_data["file_created_at"] == "2023-01-02T00:00:00" assert updated_data["file_created_at"] == "2023-01-02T00:00:00"
assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00" assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00"
assert updated_data["file_type"] == "text/markdown" assert updated_data["file_type"] == "markdown"
assert updated_data["file_type_group"] == "text"
# Test for entity not found # Test for entity not found
invalid_update_response = client.put( invalid_update_response = client.put(
@ -240,7 +245,8 @@ def test_get_entity_by_filepath(client):
size=100, size=100,
file_created_at="2023-01-01T00:00:00", file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain", file_type="txt",
file_type_group="text",
folder_id=1, folder_id=1,
) )
entity_response = client.post( entity_response = client.post(
@ -263,6 +269,7 @@ def test_get_entity_by_filepath(client):
assert entity_data["filename"] == new_entity.filename assert entity_data["filename"] == new_entity.filename
assert entity_data["size"] == new_entity.size assert entity_data["size"] == new_entity.size
assert entity_data["file_type"] == new_entity.file_type assert entity_data["file_type"] == new_entity.file_type
assert entity_data["file_type_group"] == new_entity.file_type_group
# Test for entity not found # Test for entity not found
invalid_get_response = client.get( invalid_get_response = client.get(
@ -302,7 +309,8 @@ def test_list_entities_in_folder(client):
size=100, size=100,
file_created_at="2023-01-01T00:00:00", file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00", file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain", file_type="txt",
file_type_group="text",
folder_id=folder_id, folder_id=folder_id,
) )
entity_response = client.post( entity_response = client.post(
@ -324,6 +332,7 @@ def test_list_entities_in_folder(client):
assert entities_data[0]["filename"] == new_entity.filename assert entities_data[0]["filename"] == new_entity.filename
assert entities_data[0]["size"] == new_entity.size assert entities_data[0]["size"] == new_entity.size
assert entities_data[0]["file_type"] == new_entity.file_type assert entities_data[0]["file_type"] == new_entity.file_type
assert entities_data[0]["file_type_group"] == new_entity.file_type_group
# Test for folder not found # Test for folder not found
invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities") invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities")

View File

@ -5,3 +5,4 @@ pydantic
sqlalchemy sqlalchemy
typer typer
tabulate tabulate
magika

View File

@ -18,7 +18,8 @@ setup(
'httpx', 'httpx',
'pydantic', 'pydantic',
'sqlalchemy', 'sqlalchemy',
'typer' 'typer',
'magika'
], ],
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [