mirror of
https://github.com/tcsenpai/pensieve.git
synced 2025-06-06 19:25:24 +00:00
refactor: use magika for file type detect
This commit is contained in:
parent
1dc7341b26
commit
2ffd4b0b78
@ -1,4 +1,3 @@
|
|||||||
import mimetypes
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
@ -10,7 +9,7 @@ import typer
|
|||||||
from memos.server import run_server
|
from memos.server import run_server
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from gitignore_parser import parse_gitignore
|
from magika import Magika
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
lib_app = typer.Typer()
|
lib_app = typer.Typer()
|
||||||
@ -18,6 +17,8 @@ plugin_app = typer.Typer()
|
|||||||
app.add_typer(plugin_app, name="plugin")
|
app.add_typer(plugin_app, name="plugin")
|
||||||
app.add_typer(lib_app, name="lib")
|
app.add_typer(lib_app, name="lib")
|
||||||
|
|
||||||
|
file_detector = Magika()
|
||||||
|
|
||||||
BASE_URL = "http://localhost:8080"
|
BASE_URL = "http://localhost:8080"
|
||||||
|
|
||||||
ignore_files = [".DS_Store"]
|
ignore_files = [".DS_Store"]
|
||||||
@ -32,6 +33,11 @@ def format_timestamp(timestamp):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_type(file_path):
    """Identify the content type of ``file_path`` with the module-level detector.

    Runs ``file_detector.identify_path`` on the path and returns a
    ``(ct_label, group)`` tuple read from the detection output.
    """
    detection = file_detector.identify_path(file_path).output
    return detection.ct_label, detection.group
|
||||||
|
|
||||||
|
|
||||||
def display_libraries(libraries):
|
def display_libraries(libraries):
|
||||||
table = []
|
table = []
|
||||||
for library in libraries:
|
for library in libraries:
|
||||||
@ -130,9 +136,7 @@ def scan(library_id: int):
|
|||||||
str(absolute_file_path)
|
str(absolute_file_path)
|
||||||
) # Add to scanned files set
|
) # Add to scanned files set
|
||||||
file_stat = file_path.stat()
|
file_stat = file_path.stat()
|
||||||
file_type = (
|
file_type, file_type_group = get_file_type(absolute_file_path)
|
||||||
mimetypes.guess_type(file_path)[0] or "application/octet-stream"
|
|
||||||
)
|
|
||||||
new_entity = {
|
new_entity = {
|
||||||
"filename": file_path.name,
|
"filename": file_path.name,
|
||||||
"filepath": str(absolute_file_path), # Save absolute path
|
"filepath": str(absolute_file_path), # Save absolute path
|
||||||
@ -140,6 +144,7 @@ def scan(library_id: int):
|
|||||||
"file_created_at": format_timestamp(file_stat.st_ctime),
|
"file_created_at": format_timestamp(file_stat.st_ctime),
|
||||||
"file_last_modified_at": format_timestamp(file_stat.st_mtime),
|
"file_last_modified_at": format_timestamp(file_stat.st_mtime),
|
||||||
"file_type": file_type,
|
"file_type": file_type,
|
||||||
|
"file_type_group": file_type_group,
|
||||||
"folder_id": folder["id"],
|
"folder_id": folder["id"],
|
||||||
}
|
}
|
||||||
# Check if the entity already exists
|
# Check if the entity already exists
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"data_type": "attribute",
|
"data_type": "text",
|
||||||
"entity_id": 1,
|
"entity_id": 1,
|
||||||
"id": 1,
|
"id": 1,
|
||||||
"key": "author",
|
"key": "author",
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
{
|
{
|
||||||
"file_created_at": "2023-01-01T00:00:00",
|
"file_created_at": "2023-01-01T00:00:00",
|
||||||
"file_last_modified_at": "2023-01-01T00:00:00",
|
"file_last_modified_at": "2023-01-01T00:00:00",
|
||||||
"file_type": "text/plain",
|
"file_type": "txt",
|
||||||
|
"file_type_group": "text",
|
||||||
"filename": "metadata_test_file.txt",
|
"filename": "metadata_test_file.txt",
|
||||||
"filepath": "/tmp/metadata_folder/metadata_test_file.txt",
|
"filepath": "/tmp/metadata_folder/metadata_test_file.txt",
|
||||||
"folder_id": 1,
|
"folder_id": 1,
|
||||||
|
@ -58,6 +58,7 @@ class EntityModel(Base):
|
|||||||
file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
file_created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||||
file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
file_last_modified_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||||
file_type: Mapped[str] = mapped_column(String, nullable=False)
|
file_type: Mapped[str] = mapped_column(String, nullable=False)
|
||||||
|
file_type_group: Mapped[str] = mapped_column(String, nullable=False)
|
||||||
last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
last_scan_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
||||||
library_id: Mapped[int] = mapped_column(
|
library_id: Mapped[int] = mapped_column(
|
||||||
Integer, ForeignKey("libraries.id"), nullable=False
|
Integer, ForeignKey("libraries.id"), nullable=False
|
||||||
|
@ -31,6 +31,7 @@ class NewEntityParam(BaseModel):
|
|||||||
file_created_at: datetime
|
file_created_at: datetime
|
||||||
file_last_modified_at: datetime
|
file_last_modified_at: datetime
|
||||||
file_type: str
|
file_type: str
|
||||||
|
file_type_group: str
|
||||||
folder_id: int
|
folder_id: int
|
||||||
|
|
||||||
|
|
||||||
@ -46,6 +47,7 @@ class UpdateEntityParam(BaseModel):
|
|||||||
file_created_at: datetime | None = None
|
file_created_at: datetime | None = None
|
||||||
file_last_modified_at: datetime | None = None
|
file_last_modified_at: datetime | None = None
|
||||||
file_type: str | None = None
|
file_type: str | None = None
|
||||||
|
file_type_group: str | None = None
|
||||||
tags: List[str] = []
|
tags: List[str] = []
|
||||||
attrs: List[EntityMetadataParam] = []
|
attrs: List[EntityMetadataParam] = []
|
||||||
|
|
||||||
@ -128,6 +130,7 @@ class Entity(BaseModel):
|
|||||||
file_created_at: datetime
|
file_created_at: datetime
|
||||||
file_last_modified_at: datetime
|
file_last_modified_at: datetime
|
||||||
file_type: str
|
file_type: str
|
||||||
|
file_type_group: str
|
||||||
last_scan_at: datetime | None
|
last_scan_at: datetime | None
|
||||||
folder_id: int
|
folder_id: int
|
||||||
library_id: int
|
library_id: int
|
||||||
|
@ -60,7 +60,8 @@ def setup_library_with_entity(client):
|
|||||||
size=5678,
|
size=5678,
|
||||||
file_created_at="2023-01-01T00:00:00",
|
file_created_at="2023-01-01T00:00:00",
|
||||||
file_last_modified_at="2023-01-01T00:00:00",
|
file_last_modified_at="2023-01-01T00:00:00",
|
||||||
file_type="text/plain",
|
file_type="txt",
|
||||||
|
file_type_group="text",
|
||||||
folder_id=folder_id,
|
folder_id=folder_id,
|
||||||
)
|
)
|
||||||
entity_response = client.post(
|
entity_response = client.post(
|
||||||
@ -161,7 +162,8 @@ def test_new_entity(client):
|
|||||||
size=150,
|
size=150,
|
||||||
file_created_at="2023-01-01T00:00:00",
|
file_created_at="2023-01-01T00:00:00",
|
||||||
file_last_modified_at="2023-01-01T00:00:00",
|
file_last_modified_at="2023-01-01T00:00:00",
|
||||||
file_type="text/plain",
|
file_type="txt",
|
||||||
|
file_type_group="text",
|
||||||
folder_id=folder_id,
|
folder_id=folder_id,
|
||||||
)
|
)
|
||||||
entity_response = client.post(
|
entity_response = client.post(
|
||||||
@ -178,7 +180,8 @@ def test_new_entity(client):
|
|||||||
assert entity_data["size"] == 150
|
assert entity_data["size"] == 150
|
||||||
assert entity_data["file_created_at"] == "2023-01-01T00:00:00"
|
assert entity_data["file_created_at"] == "2023-01-01T00:00:00"
|
||||||
assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
assert entity_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
||||||
assert entity_data["file_type"] == "text/plain"
|
assert entity_data["file_type"] == "txt"
|
||||||
|
assert entity_data["file_type_group"] == "text"
|
||||||
assert entity_data["folder_id"] == 1
|
assert entity_data["folder_id"] == 1
|
||||||
|
|
||||||
# Test for library not found
|
# Test for library not found
|
||||||
@ -196,7 +199,8 @@ def test_update_entity(client):
|
|||||||
updated_entity = UpdateEntityParam(
|
updated_entity = UpdateEntityParam(
|
||||||
size=200,
|
size=200,
|
||||||
file_created_at="2023-01-02T00:00:00",
|
file_created_at="2023-01-02T00:00:00",
|
||||||
file_type="text/markdown",
|
file_type="markdown",
|
||||||
|
file_type_group="text",
|
||||||
)
|
)
|
||||||
update_response = client.put(
|
update_response = client.put(
|
||||||
f"/entities/{entity_id}",
|
f"/entities/{entity_id}",
|
||||||
@ -212,7 +216,8 @@ def test_update_entity(client):
|
|||||||
assert updated_data["size"] == 200
|
assert updated_data["size"] == 200
|
||||||
assert updated_data["file_created_at"] == "2023-01-02T00:00:00"
|
assert updated_data["file_created_at"] == "2023-01-02T00:00:00"
|
||||||
assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
assert updated_data["file_last_modified_at"] == "2023-01-01T00:00:00"
|
||||||
assert updated_data["file_type"] == "text/markdown"
|
assert updated_data["file_type"] == "markdown"
|
||||||
|
assert updated_data["file_type_group"] == "text"
|
||||||
|
|
||||||
# Test for entity not found
|
# Test for entity not found
|
||||||
invalid_update_response = client.put(
|
invalid_update_response = client.put(
|
||||||
@ -240,7 +245,8 @@ def test_get_entity_by_filepath(client):
|
|||||||
size=100,
|
size=100,
|
||||||
file_created_at="2023-01-01T00:00:00",
|
file_created_at="2023-01-01T00:00:00",
|
||||||
file_last_modified_at="2023-01-01T00:00:00",
|
file_last_modified_at="2023-01-01T00:00:00",
|
||||||
file_type="text/plain",
|
file_type="txt",
|
||||||
|
file_type_group="text",
|
||||||
folder_id=1,
|
folder_id=1,
|
||||||
)
|
)
|
||||||
entity_response = client.post(
|
entity_response = client.post(
|
||||||
@ -263,6 +269,7 @@ def test_get_entity_by_filepath(client):
|
|||||||
assert entity_data["filename"] == new_entity.filename
|
assert entity_data["filename"] == new_entity.filename
|
||||||
assert entity_data["size"] == new_entity.size
|
assert entity_data["size"] == new_entity.size
|
||||||
assert entity_data["file_type"] == new_entity.file_type
|
assert entity_data["file_type"] == new_entity.file_type
|
||||||
|
assert entity_data["file_type_group"] == new_entity.file_type_group
|
||||||
|
|
||||||
# Test for entity not found
|
# Test for entity not found
|
||||||
invalid_get_response = client.get(
|
invalid_get_response = client.get(
|
||||||
@ -302,7 +309,8 @@ def test_list_entities_in_folder(client):
|
|||||||
size=100,
|
size=100,
|
||||||
file_created_at="2023-01-01T00:00:00",
|
file_created_at="2023-01-01T00:00:00",
|
||||||
file_last_modified_at="2023-01-01T00:00:00",
|
file_last_modified_at="2023-01-01T00:00:00",
|
||||||
file_type="text/plain",
|
file_type="txt",
|
||||||
|
file_type_group="text",
|
||||||
folder_id=folder_id,
|
folder_id=folder_id,
|
||||||
)
|
)
|
||||||
entity_response = client.post(
|
entity_response = client.post(
|
||||||
@ -324,6 +332,7 @@ def test_list_entities_in_folder(client):
|
|||||||
assert entities_data[0]["filename"] == new_entity.filename
|
assert entities_data[0]["filename"] == new_entity.filename
|
||||||
assert entities_data[0]["size"] == new_entity.size
|
assert entities_data[0]["size"] == new_entity.size
|
||||||
assert entities_data[0]["file_type"] == new_entity.file_type
|
assert entities_data[0]["file_type"] == new_entity.file_type
|
||||||
|
assert entities_data[0]["file_type_group"] == new_entity.file_type_group
|
||||||
|
|
||||||
# Test for folder not found
|
# Test for folder not found
|
||||||
invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities")
|
invalid_list_response = client.get(f"/libraries/{library_id}/folders/9999/entities")
|
||||||
|
@ -5,3 +5,4 @@ pydantic
|
|||||||
sqlalchemy
|
sqlalchemy
|
||||||
typer
|
typer
|
||||||
tabulate
|
tabulate
|
||||||
|
magika
|
||||||
|
Loading…
x
Reference in New Issue
Block a user