Revert "feat(ocr): cleanup ocr server related content"

This reverts commit d0f6b33554f47cb7b9920ed4a2ea967a50373463.
2025-06-10 13:07:15 +00:00 · 2024-10-04 14:47:54 +08:00 · 2024-10-04 14:47:54 +08:00 · a3240fdde9
commit a3240fdde9
parent b39d651b0c
10 changed files with 345 additions and 0 deletions
--- a/memos/plugins/ocr/README.md
+++ b/memos/plugins/ocr/README.md
@ -0,0 +1,49 @@
 # OCR Plugin
 This is a README file for the OCR plugin. This plugin uses the `RapidOCR` library to perform OCR (Optical Character Recognition) on image files and updates the metadata of the entity with the OCR results.
 ## How to Run
 To run this OCR plugin, follow the steps below:
 1. **Install the required dependencies:**
   ```bash
   pip install -r requirements.txt
   ```
 2. **Run the FastAPI application:**
   You can run the FastAPI application using `uvicorn`. Make sure you are in the directory where `main.py` is located.
   ```bash
   uvicorn main:app --host 0.0.0.0 --port 8000
   ```
 3. **Integration with memos:**
   ```sh
   $ python -m memos.commands plugin create ocr http://localhost:8000
   Plugin created successfully
   ```
   ```sh
   $ python -m memos.commands plugin ls
   ID  Name    Description    Webhook URL
    1  ocr                    http://localhost:8000/
   ```
   ```sh
   $ python -m memos.commands plugin bind --lib 1 --plugin 1
   Plugin bound to library successfully
   ```
 ## Endpoints
 - `GET /`: Health check endpoint. Returns `{"healthy": True}` if the service is running.
 - `POST /`: OCR endpoint. Accepts an `Entity` object and a `Location` header. Performs OCR on the image file and updates the entity's metadata with the OCR results.
 ## Metadata
 The OCR results are stored in the metadata field named `ocr_result` with the following structure:
--- a/memos/plugins/ocr/fonts/simfang.ttf
+++ b/memos/plugins/ocr/fonts/simfang.ttf
--- a/memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx
+++ b/memos/plugins/ocr/models/ch_PP-OCRv4_det_infer.onnx
--- a/memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx
+++ b/memos/plugins/ocr/models/ch_PP-OCRv4_rec_infer.onnx
--- a/memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx
+++ b/memos/plugins/ocr/models/ch_ppocr_mobile_v2.0_cls_train.onnx
--- a/memos/plugins/ocr/ppocr-gpu.yaml
+++ b/memos/plugins/ocr/ppocr-gpu.yaml
@ -0,0 +1,41 @@
 Global:
    text_score: 0.5
    use_det: true
    use_cls: true
    use_rec: true
    print_verbose: false
    min_height: 30
    width_height_ratio: 40
 Det:
    use_cuda: true
    model_path: models/ch_PP-OCRv4_det_infer.onnx
    limit_side_len: 1500
    limit_type: min
    thresh: 0.3
    box_thresh: 0.3
    max_candidates: 1000
    unclip_ratio: 1.6
    use_dilation: true
    score_mode: fast
 Cls:
    use_cuda: true
    model_path: models/ch_ppocr_mobile_v2.0_cls_train.onnx
    cls_image_shape: [3, 48, 192]
    cls_batch_num: 6
    cls_thresh: 0.9
    label_list: ['0', '180']
 Rec:
    use_cuda: true
    model_path: models/ch_PP-OCRv4_rec_infer.onnx
    rec_img_shape: [3, 48, 320]
    rec_batch_num: 6
--- a/memos/plugins/ocr/ppocr.yaml
+++ b/memos/plugins/ocr/ppocr.yaml
@ -0,0 +1,41 @@
 Global:
    text_score: 0.5
    use_det: true
    use_cls: true
    use_rec: true
    print_verbose: false
    min_height: 30
    width_height_ratio: 40
 Det:
    use_cuda: false
    model_path: models/ch_PP-OCRv4_det_infer.onnx
    limit_side_len: 1500
    limit_type: min
    thresh: 0.3
    box_thresh: 0.3
    max_candidates: 1000
    unclip_ratio: 1.6
    use_dilation: true
    score_mode: fast
 Cls:
    use_cuda: false
    model_path: models/ch_ppocr_mobile_v2.0_cls_train.onnx
    cls_image_shape: [3, 48, 192]
    cls_batch_num: 6
    cls_thresh: 0.9
    label_list: ['0', '180']
 Rec:
    use_cuda: false
    model_path: models/ch_PP-OCRv4_rec_infer.onnx
    rec_img_shape: [3, 48, 320]
    rec_batch_num: 6
--- a/memos/plugins/ocr/requirements.txt
+++ b/memos/plugins/ocr/requirements.txt
@ -0,0 +1,4 @@
 rapidocr_onnxruntime
 httpx
 fastapi
 # Note: If you are using GPU, you should add onnxruntime-gpu to the requirements
--- a/memos/plugins/ocr/server.py
+++ b/memos/plugins/ocr/server.py
@ -0,0 +1,209 @@
 from PIL import Image
 import numpy as np
 import logging
 from fastapi import FastAPI, Body, HTTPException
 import base64
 import io
 import asyncio
 from pydantic import BaseModel, Field
 from typing import List
 from multiprocessing import Pool
 import threading
 import time
 import uvicorn
 # Configure logger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI()
 # 创建进程池
 process_pool = None
 def init_worker(use_gpu):
    global ocr
    ocr = init_ocr(use_gpu)
 def init_process_pool(max_workers, use_gpu):
    global process_pool
    process_pool = Pool(
        processes=max_workers, initializer=init_worker, initargs=(use_gpu,)
    )
 def init_ocr(use_gpu):
    if use_gpu:
        try:
            from rapidocr_paddle import RapidOCR as RapidOCRPaddle
            ocr = RapidOCRPaddle(
                det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True
            )
            logger.info("Initialized OCR with RapidOCR Paddle (GPU)")
        except ImportError:
            logger.error(
                "Failed to import rapidocr_paddle. Make sure it's installed for GPU usage."
            )
            raise
    else:
        try:
            from rapidocr_onnxruntime import RapidOCR
            ocr = RapidOCR(config_path="ppocr.yaml")
            logger.info("Initialized OCR with RapidOCR ONNX Runtime (CPU)")
        except ImportError:
            logger.error(
                "Failed to import rapidocr_onnxruntime. Make sure it's installed for CPU usage."
            )
            raise
    return ocr
 def convert_ocr_results(results):
    if results is None:
        return []
    converted = []
    for result in results:
        item = {"dt_boxes": result[0], "rec_txt": result[1], "score": result[2]}
        converted.append(item)
    return converted
 def predict(image_data):
    global ocr
    if ocr is None:
        raise ValueError("OCR engine not initialized")
    image = Image.open(io.BytesIO(image_data))
    img_array = np.array(image)
    results, _ = ocr(img_array)
    converted_results = convert_ocr_results(results)
    return converted_results
 def convert_to_python_type(item):
    if isinstance(item, np.ndarray):
        return item.tolist()
    elif isinstance(item, np.generic):  # This includes numpy scalars like numpy.float32
        return item.item()
    elif isinstance(item, list):
        return [convert_to_python_type(sub_item) for sub_item in item]
    elif isinstance(item, dict):
        return {key: convert_to_python_type(value) for key, value in item.items()}
    else:
        return item
 async def async_predict(image_data):
    loop = asyncio.get_running_loop()
    results = await loop.run_in_executor(
        None, process_pool.apply, predict, (image_data,)
    )
    return results
 class OCRResult(BaseModel):
    dt_boxes: List[List[float]] = Field(..., description="Bounding box coordinates")
    rec_txt: str = Field(..., description="Recognized text")
    score: float = Field(..., description="Confidence score")
@app.post("/predict", response_model=List[OCRResult])
 async def predict_base64(image_base64: str = Body(..., embed=True)):
    try:
        if not image_base64:
            raise HTTPException(status_code=400, detail="Missing image_base64 field")
        # Remove header part if present
        if image_base64.startswith("data:image"):
            image_base64 = image_base64.split(",")[1]
        # Decode the base64 image
        image_data = base64.b64decode(image_base64)
        # 直接传递图像数据给async_predict
        ocr_result = await async_predict(image_data)
        return convert_to_python_type(ocr_result)
    except Exception as e:
        logging.error(f"Error during OCR processing: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
 shutdown_event = threading.Event()
 def signal_handler(signum, frame):
    logger.info("Received interrupt signal. Initiating shutdown...")
    shutdown_event.set()
 def run_server(app, host, port):
    config = uvicorn.Config(app, host=host, port=port, loop="asyncio")
    server = uvicorn.Server(config)
    server.install_signal_handlers = (
        lambda: None
    )  # Disable Uvicorn's own signal handlers
    async def serve():
        await server.serve()
    thread = threading.Thread(target=asyncio.run, args=(serve(),))
    thread.start()
    try:
        while not shutdown_event.is_set():
            time.sleep(1)
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt received. Initiating shutdown...")
    finally:
        shutdown_event.set()
        logger.info("Stopping the server...")
        asyncio.run(server.shutdown())
        thread.join()
        logger.info("Server stopped.")
 if __name__ == "__main__":
    import uvicorn
    import argparse
    parser = argparse.ArgumentParser(description="OCR Service")
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to run the OCR service on",
    )
    parser.add_argument(
        "--max-workers",
        type=int,
        default=1,
        help="Maximum number of worker threads for OCR processing",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU for OCR processing",
    )
    args = parser.parse_args()
    port = args.port
    max_workers = args.max_workers
    use_gpu = args.gpu
    try:
        init_process_pool(max_workers, use_gpu)
        run_server(app, "0.0.0.0", port)
    finally:
        logger.info("Shutting down process pool...")
        if process_pool:
            process_pool.close()
            process_pool.join()
        logger.info("Process pool shut down.")
--- a/pyproject.toml
+++ b/pyproject.toml
@ -29,6 +29,7 @@ dependencies = [
    "pillow",
    "piexif",
    "imagehash",
    "rapidocr_onnxruntime",
    "screeninfo",
    "pywin32; sys_platform == 'win32'",
    "psutil; sys_platform == 'win32'",