refactor: use magika for file type detection

This commit is contained in:
arkohut 2024-06-12 20:26:24 +08:00
parent 1dc7341b26
commit 2ffd4b0b78
8 changed files with 36 additions and 15 deletions

View File

@ -1,4 +1,3 @@
import mimetypes
import os
import time
from datetime import datetime, timezone
@ -10,7 +9,7 @@ import typer
from memos.server import run_server
from tabulate import tabulate
from tqdm import tqdm
from gitignore_parser import parse_gitignore
from magika import Magika
app = typer.Typer()
lib_app = typer.Typer()
@ -18,6 +17,8 @@ plugin_app = typer.Typer()
app.add_typer(plugin_app, name="plugin")
app.add_typer(lib_app, name="lib")
file_detector = Magika()
BASE_URL = "http://localhost:8080"
ignore_files = [".DS_Store"]
@ -32,6 +33,11 @@ def format_timestamp(timestamp):
)
def get_file_type(file_path):
    """Identify the content type of the file at ``file_path``.

    Delegates to the module-level ``file_detector`` and returns a
    ``(ct_label, group)`` tuple read from the detection result's ``output``.
    """
    detected = file_detector.identify_path(file_path).output
    return detected.ct_label, detected.group
def display_libraries(libraries):
table = []
for library in libraries:
@ -130,9 +136,7 @@ def scan(library_id: int):
str(absolute_file_path)
) # Add to scanned files set
file_stat = file_path.stat()
file_type = (
mimetypes.guess_type(file_path)[0] or "application/octet-stream"
)
file_type, file_type_group = get_file_type(absolute_file_path)
new_entity = {
"filename": file_path.name,
"filepath": str(absolute_file_path), # Save absolute path
@ -140,6 +144,7 @@ def scan(library_id: int):
"file_created_at": format_timestamp(file_stat.st_ctime),
"file_last_modified_at": format_timestamp(file_stat.st_mtime),
"file_type": file_type,
"file_type_group": file_type_group,
"folder_id": folder["id"],
}
# Check if the entity already exists

View File

@ -1,5 +1,5 @@
{
"data_type": "attribute",
"data_type": "text",
"entity_id": 1,
"id": 1,
"key": "author",

View File

@ -1,7 +1,8 @@
{
"file_created_at": "2023-01-01T00:00:00",
"file_last_modified_at": "2023-01-01T00:00:00",
"file_type": "text/plain",
"file_type": "txt",
"file_type_group": "text",
"filename": "metadata_test_file.txt",
"filepath": "/tmp/metadata_folder/metadata_test_file.txt",
"folder_id": 1,

View File

@ -58,6 +58,7 @@ class EntityModel(Base):
file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
file_type: Mapped[str] = mapped_column(String, nullable=False)
file_type_group: Mapped[str] = mapped_column(String, nullable=False)
last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
library_id: Mapped[int] = mapped_column(
Integer, ForeignKey("libraries.id"), nullable=False

View File

@ -31,6 +31,7 @@ class NewEntityParam(BaseModel):
file_created_at: datetime
file_last_modified_at: datetime
file_type: str
file_type_group: str
folder_id: int
@ -46,6 +47,7 @@ class UpdateEntityParam(BaseModel):
file_created_at: datetime | None = None
file_last_modified_at: datetime | None = None
file_type: str | None = None
file_type_group: str | None = None
tags: List[str] = []
attrs: List[EntityMetadataParam] = []
@ -128,6 +130,7 @@ class Entity(BaseModel):
file_created_at: datetime
file_last_modified_at: datetime
file_type: str
file_type_group: str
last_scan_at: datetime | None
folder_id: int
library_id: int

View File

@ -60,7 +60,8 @@ def setup_library_with_entity(client):
size=5678,
file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain",
file_type="txt",
file_type_group="text",
folder_id=folder_id,
)
entity_response = client.post(
@ -161,7 +162,8 @@ def test_new_entity(client):
size=150,
file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain",
file_type="txt",
file_type_group="text",
folder_id=folder_id,
)
entity_response = client.post(
@ -178,7 +180,8 @@ def test_new_entity(client):
assert entity_data["size"] == 150
assert entity_data["file_created_at"] == "2023-01-01T00:00:00"
assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00"
assert entity_data["file_type"] == "text/plain"
assert entity_data["file_type"] == "txt"
assert entity_data["file_type_group"] == "text"
assert entity_data["folder_id"] == 1
# Test for library not found
@ -196,7 +199,8 @@ def test_update_entity(client):
updated_entity = UpdateEntityParam(
size=200,
file_created_at="2023-01-02T00:00:00",
file_type="text/markdown",
file_type="markdown",
file_type_group="text",
)
update_response = client.put(
f"/entities/{entity_id}",
@ -212,7 +216,8 @@ def test_update_entity(client):
assert updated_data["size"] == 200
assert updated_data["file_created_at"] == "2023-01-02T00:00:00"
assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00"
assert updated_data["file_type"] == "text/markdown"
assert updated_data["file_type"] == "markdown"
assert updated_data["file_type_group"] == "text"
# Test for entity not found
invalid_update_response = client.put(
@ -240,7 +245,8 @@ def test_get_entity_by_filepath(client):
size=100,
file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain",
file_type="txt",
file_type_group="text",
folder_id=1,
)
entity_response = client.post(
@ -263,6 +269,7 @@ def test_get_entity_by_filepath(client):
assert entity_data["filename"] == new_entity.filename
assert entity_data["size"] == new_entity.size
assert entity_data["file_type"] == new_entity.file_type
assert entity_data["file_type_group"] == new_entity.file_type_group
# Test for entity not found
invalid_get_response = client.get(
@ -302,7 +309,8 @@ def test_list_entities_in_folder(client):
size=100,
file_created_at="2023-01-01T00:00:00",
file_last_modified_at="2023-01-01T00:00:00",
file_type="text/plain",
file_type="txt",
file_type_group="text",
folder_id=folder_id,
)
entity_response = client.post(
@ -324,6 +332,7 @@ def test_list_entities_in_folder(client):
assert entities_data[0]["filename"] == new_entity.filename
assert entities_data[0]["size"] == new_entity.size
assert entities_data[0]["file_type"] == new_entity.file_type
assert entities_data[0]["file_type_group"] == new_entity.file_type_group
# Test for folder not found
invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities")

View File

@ -5,3 +5,4 @@ pydantic
sqlalchemy
typer
tabulate
magika

View File

@ -18,7 +18,8 @@ setup(
'httpx',
'pydantic',
'sqlalchemy',
'typer'
'typer',
'magika'
],
entry_points={
'console_scripts': [