mirror of
https://github.com/tcsenpai/pensieve.git
synced 2025-06-06 03:05:25 +00:00
refactor: use magika for file type detect
This commit is contained in:
parent
1dc7341b26
commit
2ffd4b0b78
@ -1,4 +1,3 @@
|
||||
import mimetypes
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
@ -10,7 +9,7 @@ import typer
|
||||
from memos.server import run_server
|
||||
from tabulate import tabulate
|
||||
from tqdm import tqdm
|
||||
from gitignore_parser import parse_gitignore
|
||||
from magika import Magika
|
||||
|
||||
app = typer.Typer()
|
||||
lib_app = typer.Typer()
|
||||
@ -18,6 +17,8 @@ plugin_app = typer.Typer()
|
||||
app.add_typer(plugin_app, name="plugin")
|
||||
app.add_typer(lib_app, name="lib")
|
||||
|
||||
file_detector = Magika()
|
||||
|
||||
BASE_URL = "http://localhost:8080"
|
||||
|
||||
ignore_files = [".DS_Store"]
|
||||
@ -32,6 +33,11 @@ def format_timestamp(timestamp):
|
||||
)
|
||||
|
||||
|
||||
def get_file_type(file_path):
|
||||
file_result = file_detector.identify_path(file_path)
|
||||
return file_result.output.ct_label, file_result.output.group
|
||||
|
||||
|
||||
def display_libraries(libraries):
|
||||
table = []
|
||||
for library in libraries:
|
||||
@ -130,9 +136,7 @@ def scan(library_id: int):
|
||||
str(absolute_file_path)
|
||||
) # Add to scanned files set
|
||||
file_stat = file_path.stat()
|
||||
file_type = (
|
||||
mimetypes.guess_type(file_path)[0] or "application/octet-stream"
|
||||
)
|
||||
file_type, file_type_group = get_file_type(absolute_file_path)
|
||||
new_entity = {
|
||||
"filename": file_path.name,
|
||||
"filepath": str(absolute_file_path), # Save absolute path
|
||||
@ -140,6 +144,7 @@ def scan(library_id: int):
|
||||
"file_created_at": format_timestamp(file_stat.st_ctime),
|
||||
"file_last_modified_at": format_timestamp(file_stat.st_mtime),
|
||||
"file_type": file_type,
|
||||
"file_type_group": file_type_group,
|
||||
"folder_id": folder["id"],
|
||||
}
|
||||
# Check if the entity already exists
|
||||
|
@ -1,5 +1,5 @@
|
||||
{
|
||||
"data_type": "attribute",
|
||||
"data_type": "text",
|
||||
"entity_id": 1,
|
||||
"id": 1,
|
||||
"key": "author",
|
||||
|
@ -1,7 +1,8 @@
|
||||
{
|
||||
"file_created_at": "2023-01-01T00:00:00",
|
||||
"file_last_modified_at": "2023-01-01T00:00:00",
|
||||
"file_type": "text/plain",
|
||||
"file_type": "txt",
|
||||
"file_type_group": "text",
|
||||
"filename": "metadata_test_file.txt",
|
||||
"filepath": "/tmp/metadata_folder/metadata_test_file.txt",
|
||||
"folder_id": 1,
|
||||
|
@ -58,6 +58,7 @@ class EntityModel(Base):
|
||||
file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
file_type: Mapped[str] = mapped_column(String, nullable=False)
|
||||
file_type_group: Mapped[str] = mapped_column(String, nullable=False)
|
||||
last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
||||
library_id: Mapped[int] = mapped_column(
|
||||
Integer, ForeignKey("libraries.id"), nullable=False
|
||||
|
@ -31,6 +31,7 @@ class NewEntityParam(BaseModel):
|
||||
file_created_at: datetime
|
||||
file_last_modified_at: datetime
|
||||
file_type: str
|
||||
file_type_group: str
|
||||
folder_id: int
|
||||
|
||||
|
||||
@ -46,6 +47,7 @@ class UpdateEntityParam(BaseModel):
|
||||
file_created_at: datetime | None = None
|
||||
file_last_modified_at: datetime | None = None
|
||||
file_type: str | None = None
|
||||
file_type_group: str | None = None
|
||||
tags: List[str] = []
|
||||
attrs: List[EntityMetadataParam] = []
|
||||
|
||||
@ -128,6 +130,7 @@ class Entity(BaseModel):
|
||||
file_created_at: datetime
|
||||
file_last_modified_at: datetime
|
||||
file_type: str
|
||||
file_type_group: str
|
||||
last_scan_at: datetime | None
|
||||
folder_id: int
|
||||
library_id: int
|
||||
|
@ -60,7 +60,8 @@ def setup_library_with_entity(client):
|
||||
size=5678,
|
||||
file_created_at="2023-01-01T00:00:00",
|
||||
file_last_modified_at="2023-01-01T00:00:00",
|
||||
file_type="text/plain",
|
||||
file_type="txt",
|
||||
file_type_group="text",
|
||||
folder_id=folder_id,
|
||||
)
|
||||
entity_response = client.post(
|
||||
@ -161,7 +162,8 @@ def test_new_entity(client):
|
||||
size=150,
|
||||
file_created_at="2023-01-01T00:00:00",
|
||||
file_last_modified_at="2023-01-01T00:00:00",
|
||||
file_type="text/plain",
|
||||
file_type="txt",
|
||||
file_type_group="text",
|
||||
folder_id=folder_id,
|
||||
)
|
||||
entity_response = client.post(
|
||||
@ -178,7 +180,8 @@ def test_new_entity(client):
|
||||
assert entity_data["size"] == 150
|
||||
assert entity_data["file_created_at"] == "2023-01-01T00:00:00"
|
||||
assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
||||
assert entity_data["file_type"] == "text/plain"
|
||||
assert entity_data["file_type"] == "txt"
|
||||
assert entity_data["file_type_group"] == "text"
|
||||
assert entity_data["folder_id"] == 1
|
||||
|
||||
# Test for library not found
|
||||
@ -196,7 +199,8 @@ def test_update_entity(client):
|
||||
updated_entity = UpdateEntityParam(
|
||||
size=200,
|
||||
file_created_at="2023-01-02T00:00:00",
|
||||
file_type="text/markdown",
|
||||
file_type="markdown",
|
||||
file_type_group="text",
|
||||
)
|
||||
update_response = client.put(
|
||||
f"/entities/{entity_id}",
|
||||
@ -212,7 +216,8 @@ def test_update_entity(client):
|
||||
assert updated_data["size"] == 200
|
||||
assert updated_data["file_created_at"] == "2023-01-02T00:00:00"
|
||||
assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
||||
assert updated_data["file_type"] == "text/markdown"
|
||||
assert updated_data["file_type"] == "markdown"
|
||||
assert updated_data["file_type_group"] == "text"
|
||||
|
||||
# Test for entity not found
|
||||
invalid_update_response = client.put(
|
||||
@ -240,7 +245,8 @@ def test_get_entity_by_filepath(client):
|
||||
size=100,
|
||||
file_created_at="2023-01-01T00:00:00",
|
||||
file_last_modified_at="2023-01-01T00:00:00",
|
||||
file_type="text/plain",
|
||||
file_type="txt",
|
||||
file_type_group="text",
|
||||
folder_id=1,
|
||||
)
|
||||
entity_response = client.post(
|
||||
@ -263,6 +269,7 @@ def test_get_entity_by_filepath(client):
|
||||
assert entity_data["filename"] == new_entity.filename
|
||||
assert entity_data["size"] == new_entity.size
|
||||
assert entity_data["file_type"] == new_entity.file_type
|
||||
assert entity_data["file_type_group"] == new_entity.file_type_group
|
||||
|
||||
# Test for entity not found
|
||||
invalid_get_response = client.get(
|
||||
@ -302,7 +309,8 @@ def test_list_entities_in_folder(client):
|
||||
size=100,
|
||||
file_created_at="2023-01-01T00:00:00",
|
||||
file_last_modified_at="2023-01-01T00:00:00",
|
||||
file_type="text/plain",
|
||||
file_type="txt",
|
||||
file_type_group="text",
|
||||
folder_id=folder_id,
|
||||
)
|
||||
entity_response = client.post(
|
||||
@ -324,6 +332,7 @@ def test_list_entities_in_folder(client):
|
||||
assert entities_data[0]["filename"] == new_entity.filename
|
||||
assert entities_data[0]["size"] == new_entity.size
|
||||
assert entities_data[0]["file_type"] == new_entity.file_type
|
||||
assert entities_data[0]["file_type_group"] == new_entity.file_type_group
|
||||
|
||||
# Test for folder not found
|
||||
invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities")
|
||||
|
@ -5,3 +5,4 @@ pydantic
|
||||
sqlalchemy
|
||||
typer
|
||||
tabulate
|
||||
magika
|
||||
|
Loading…
x
Reference in New Issue
Block a user