feat(plugins): add ocr as built-in plugin

2025-06-09 12:37:12 +00:00 · 2024-08-25 16:48:28 +08:00 · 2024-08-25 16:48:28 +08:00 · 67a5e10d3e
commit 67a5e10d3e
parent ec7ba1f989
14 changed files with 134 additions and 33 deletions
--- a/memos/config.py
+++ b/memos/config.py
@ -7,13 +7,18 @@ class VLMSettings(BaseModel):
    enabled: bool = False
    modelname: str = "internvl-1.5"
    endpoint: str = "http://localhost:11434"
 class OCRSettings(BaseModel):
    enabled: bool = True
    endpoint: str = "http://localhost:5555/predict"
    token: str = ""
-    concurrency: int = 8
+    concurrency: int = 4
 class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        yaml_file=str(Path.home() / ".memos" / "config.yaml"),
-        yaml_file_encoding="utf-8"
+        yaml_file_encoding="utf-8",
    )
    base_dir: str = str(Path.home() / ".memos")
@ -28,6 +33,10 @@ class Settings(BaseSettings):
    # VLM plugin settings
    vlm: VLMSettings = VLMSettings()
    # OCR plugin settings
    ocr: OCRSettings = OCRSettings()
 settings = Settings()
 # Define the default database path
@ -38,4 +47,4 @@ TYPESENSE_COLLECTION_NAME = settings.typesense_collection_name
 # Function to get the database path from environment variable or default
 def get_database_path():
-    return settings.database_path
+    return settings.database_path
--- a/memos/models.py
+++ b/memos/models.py
@ -148,6 +148,25 @@ class LibraryPluginModel(Base):
    )
 def initialize_default_plugins(session):
    """Insert the built-in VLM and OCR plugin rows when they are missing.

    Each default plugin is looked up by name; only names with no existing
    row are added. A single commit is issued after all defaults have been
    checked.
    """
    builtin_plugins = (
        PluginModel(name="buildin_vlm", description="VLM Plugin", webhook_url="/plugins/vlm"),
        PluginModel(name="buildin_ocr", description="OCR Plugin", webhook_url="/plugins/ocr"),
    )
    for candidate in builtin_plugins:
        already_stored = session.query(PluginModel).filter_by(name=candidate.name).first()
        if already_stored:
            # A row with this name exists; leave it untouched.
            continue
        session.add(candidate)
    session.commit()
 # Create the database engine with the path from config
 # NOTE: these are module-level statements, so they run when this module is
 # imported, not on an explicit call.
 engine = create_engine(f"sqlite:///{get_database_path()}")
 # Emit the schema for every model registered on Base.
 Base.metadata.create_all(engine)
 # Initialize default plugins
 # A short-lived session is opened only to seed the built-in plugin rows via
 # initialize_default_plugins(), which commits before the session is closed.
 from sqlalchemy.orm import sessionmaker
 Session = sessionmaker(bind=engine)
 with Session() as session:
    initialize_default_plugins(session)
--- a/memos/plugins/ocr/README.md
+++ b/memos/plugins/ocr/README.md
@ -0,0 +1,49 @@
 # OCR Plugin
 This is a README file for the OCR plugin. This plugin uses the `RapidOCR` library to perform OCR (Optical Character Recognition) on image files and updates the metadata of the entity with the OCR results.
 ## How to Run
 To run this OCR plugin, follow the steps below:
 1. **Install the required dependencies:**
   ```bash
   pip install -r requirements.txt
   ```
 2. **Run the FastAPI application:**
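   Run `main.py` directly from the directory where it is located. The script builds the FastAPI application, mounts the plugin router, and starts `uvicorn` itself. `main.py` no longer exposes a module-level `app`, so `uvicorn main:app` cannot be used.
   ```bash
   python main.py --port 8000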
   ```
 3. **Integration with memos:**
   ```sh
   $ python -m memos.commands plugin create ocr http://localhost:8000
   Plugin created successfully
   ```
   ```sh
   $ python -m memos.commands plugin ls
   ID  Name    Description    Webhook URL
    1  ocr                    http://localhost:8000/
   ```
   ```sh
   $ python -m memos.commands plugin bind --lib 1 --plugin 1
   Plugin bound to library successfully
   ```
 ## Endpoints
 - `GET /`: Health check endpoint. Returns the JSON body `{"healthy": true}` if the service is running.
 - `POST /`: OCR endpoint. Accepts an `Entity` object and a `Location` header. Performs OCR on the image file and updates the entity's metadata with the OCR results.
 ## Metadata
 The OCR results are stored in the metadata field named `ocr_result` with the following structure:
--- a/memos/plugins/ocr/init.py
+++ b/memos/plugins/ocr/init.py
--- a/memos/plugins/ocr/fonts/simfang.ttf
+++ b/memos/plugins/ocr/fonts/simfang.ttf
--- a/memos/plugins/ocr/main.py
+++ b/memos/plugins/ocr/main.py
@ -8,17 +8,20 @@ import io
 import os
 from PIL import Image
-from fastapi import FastAPI, Request, HTTPException
+from fastapi import APIRouter, FastAPI, Request, HTTPException
 from memos.schemas import Entity, MetadataType
 METADATA_FIELD_NAME = "ocr_result"
 PLUGIN_NAME = "ocr"
-app = FastAPI()
+router = APIRouter(
-
+    tags=[PLUGIN_NAME],
    responses={404: {"description": "Not found"}}
 )
 endpoint = None
 token = None
-semaphore = asyncio.Semaphore(4)
+concurrency = None
 semaphore = None
 # Configure logger
 logging.basicConfig(level=logging.INFO)
@ -50,6 +53,7 @@ async def fetch(endpoint: str, client, image_base64, headers: Optional[dict] = N
        return response.json()
 # Modify the predict function to use semaphore
 async def predict(img_path):
    image_base64 = image2base64(img_path)
    if not image_base64:
@ -59,19 +63,18 @@ async def predict(img_path):
        headers = {}
        if token:
            headers["Authorization"] = f"Bearer {token}"
-        ocr_result = await fetch(endpoint, client, image_base64, headers=headers)
+        async with semaphore:
            ocr_result = await fetch(endpoint, client, image_base64, headers=headers)
        return ocr_result
-app = FastAPI()
+@router.get("/")
@app.get("/")
 async def read_root():
    return {"healthy": True}
-@app.post("/")
+@router.post("", include_in_schema=False)
@router.post("/")
 async def ocr(entity: Entity, request: Request):
    if not entity.file_type_group == "image":
        return {METADATA_FIELD_NAME: "{}"}
@ -123,27 +126,45 @@ async def ocr(entity: Entity, request: Request):
    }
 def init_plugin(config):
    """Configure the OCR plugin's module-level state from *config*.

    Reads ``endpoint``, ``token`` and ``concurrency`` from *config*, stores
    them in the module globals of the same names, and builds the
    ``asyncio.Semaphore`` that ``predict`` acquires around each OCR request.

    Args:
        config: Any object exposing ``endpoint``, ``token`` and
            ``concurrency`` attributes (the OCR settings object passed by
            the server, or the parsed argparse namespace in standalone mode).
    """
    global endpoint, token, concurrency, semaphore
    endpoint = config.endpoint
    token = config.token
    concurrency = config.concurrency
    semaphore = asyncio.Semaphore(concurrency)
    print(f"Endpoint: {endpoint}")
    # Never echo the bearer token itself: startup output usually ends up in
    # logs, so only report whether a token is configured.
    print(f"Token: {'<set>' if token else '<not set>'}")
    print(f"Concurrency: {concurrency}")
 if __name__ == "__main__":
    import uvicorn
    import argparse
    from fastapi import FastAPI
    parser = argparse.ArgumentParser(description="OCR Plugin")
    parser.add_argument(
        "--endpoint",
        type=str,
-        required=True,
+        default="http://localhost:8080",
        help="The endpoint URL for the OCR service",
    )
    parser.add_argument(
-        "--token", type=str, required=False, help="The token for authentication"
+        "--token", type=str, default="", help="The token for authentication"
    )
    parser.add_argument(
        "--concurrency", type=int, default=4, help="The concurrency level"
    )
    parser.add_argument(
        "--port", type=int, default=8000, help="The port number to run the server on"
    )
    args = parser.parse_args()
    endpoint = args.endpoint
    token = args.token
    port = args.port
-    uvicorn.run(app, host="0.0.0.0", port=port)
+    init_plugin(args)
    app = FastAPI()
    app.include_router(router)
    uvicorn.run(app, host="0.0.0.0", port=args.port)
--- a/memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx
+++ b/memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx
--- a/memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx
+++ b/memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx
--- a/memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx
+++ b/memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx
--- a/memos/plugins/ocr/ppocr-gpu.yaml
+++ b/memos/plugins/ocr/ppocr-gpu.yaml
--- a/memos/plugins/ocr/ppocr.yaml
+++ b/memos/plugins/ocr/ppocr.yaml
--- a/memos/plugins/ocr/requirements.txt
+++ b/memos/plugins/ocr/requirements.txt
--- a/memos/plugins/ocr/server.py
+++ b/memos/plugins/ocr/server.py
--- a/memos/server.py
+++ b/memos/server.py
@ -88,6 +88,18 @@ app.mount(
    "/_app", StaticFiles(directory=os.path.join(current_dir, "static/_app"), html=True)
 )
 # Add VLM plugin router
 # The prefix matches the relative webhook_url ("/plugins/vlm") stored for the
 # built-in VLM plugin row; trigger_webhooks expands URLs that start with "/"
 # against the incoming request's base URL.
 if settings.vlm.enabled:
    print("VLM plugin is enabled")
    # init_plugin must run before the router serves requests.
    vlm_main.init_plugin(settings.vlm)
    app.include_router(vlm_main.router, prefix="/plugins/vlm")
 # Add OCR plugin router
 # Same pattern as above; the prefix matches the built-in OCR plugin's
 # webhook_url ("/plugins/ocr").
 if settings.ocr.enabled:
    print("OCR plugin is enabled")
    # Sets the OCR module's endpoint, token and concurrency semaphore.
    ocr_main.init_plugin(settings.ocr)
    app.include_router(ocr_main.router, prefix="/plugins/ocr")
@app.get("/favicon.png", response_class=FileResponse)
 async def favicon_png():
@ -178,8 +190,12 @@ async def trigger_webhooks(
                    location = str(
                        request.url_for("get_entity_by_id", entity_id=entity.id)
                    )
                    webhook_url = plugin.webhook_url
                    if webhook_url.startswith("/"):
                        webhook_url = str(request.base_url)[:-1] + webhook_url
                        print(f"webhook_url: {webhook_url}")
                    task = client.post(
-                        plugin.webhook_url,
+                        webhook_url,
                        json=entity.model_dump(mode="json"),
                        headers={"Location": location},
                        timeout=60.0,
@ -683,19 +699,6 @@ async def get_file(file_path: str):
        raise HTTPException(status_code=404, detail="File not found")
 # Add VLM plugin router
 if settings.vlm.enabled:
    print("VLM plugin is enabled")
    vlm_main.init_plugin(settings.vlm)
    app.include_router(vlm_main.router, prefix=f"/plugins/{vlm_main.PLUGIN_NAME}")
 # Add OCR plugin router
 if settings.ocr.enabled:
    print("OCR plugin is enabled")
    ocr_main.init_plugin(settings.ocr)
    app.include_router(ocr_main.router, prefix=f"/plugins/{ocr_main.PLUGIN_NAME}")
 def run_server():
    print("Database path:", get_database_path())
    print(