refact(ml_backend): separate servers

arkohut 2024-10-18 15:31:13 +08:00
parent ad779b1b58
commit 189b82739d
5 changed files with 408 additions and 263 deletions

View File

@@ -0,0 +1,176 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import httpx
import torch
from PIL import Image
import base64
import io
from transformers import AutoProcessor, AutoModelForCausalLM
import time
from memos_ml_backends.schemas import (
ChatCompletionRequest,
ChatCompletionResponse,
ModelData,
ModelsResponse,
get_image_from_url,
)
MODEL_INFO = {"name": "florence2-base-ft", "max_model_len": 2048}
# Detect the available device
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
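# Prefer float16 where it is safe: fall back to float32 on older CUDA GPUs
# (compute capability <= 6) and on CPU-only machines, where half precision is
# slow or unsupported; newer GPUs and MPS use float16.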
torch_dtype = (
torch.float32
if (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] <= 6)
or (not torch.cuda.is_available() and not torch.backends.mps.is_available())
else torch.float16
)
print(f"Using device: {device}")
# Load Florence-2 model
florence_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Florence-2-base-ft",
torch_dtype=torch_dtype,
attn_implementation="sdpa",
trust_remote_code=True,
).to(device, torch_dtype)
florence_processor = AutoProcessor.from_pretrained(
"microsoft/Florence-2-base-ft", trust_remote_code=True
)
app = FastAPI()
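# Florence-2 captioning helper: the free-form text input is ignored and the
# output is driven entirely by the fixed <MORE_DETAILED_CAPTION> task prompt.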
async def generate_florence_result(text_input, image_input, max_tokens):
task_prompt = "<MORE_DETAILED_CAPTION>"
prompt = task_prompt + ""
inputs = florence_processor(
text=prompt, images=image_input, return_tensors="pt"
).to(device, torch_dtype)
generated_ids = florence_model.generate(
input_ids=inputs["input_ids"],
pixel_values=inputs["pixel_values"],
max_new_tokens=max_tokens or 1024,
do_sample=False,
num_beams=3,
)
generated_texts = florence_processor.batch_decode(
generated_ids, skip_special_tokens=False
)
parsed_answer = florence_processor.post_process_generation(
generated_texts[0],
task=task_prompt,
image_size=(image_input.width, image_input.height),
)
return parsed_answer.get(task_prompt, "")
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
try:
last_message = request.messages[-1]
text_input = last_message.get("content", "")
image_input = None
if isinstance(text_input, list):
for content in text_input:
if content.get("type") == "image_url":
image_url = content["image_url"].get("url")
image_input = await get_image_from_url(image_url)
break
text_input = " ".join(
[
content["text"]
for content in text_input
if content.get("type") == "text"
]
)
if image_input is None:
raise ValueError("Image input is required")
parsed_answer = await generate_florence_result(
text_input, image_input, request.max_tokens
)
result = ChatCompletionResponse(
id=str(int(time.time())),
object="chat.completion",
created=int(time.time()),
model=request.model,
choices=[
{
"index": 0,
"message": {
"role": "assistant",
"content": parsed_answer,
},
"finish_reason": "stop",
}
],
usage={
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0,
},
)
return result
except Exception as e:
print(f"Error generating chat completion: {str(e)}")
raise HTTPException(
status_code=500, detail=f"Error generating chat completion: {str(e)}"
)
@app.get("/v1/models", response_model=ModelsResponse)
async def get_models():
model_data = ModelData(
id=MODEL_INFO["name"],
created=int(time.time()),
max_model_len=MODEL_INFO["max_model_len"],
permission=[
{
"id": f"modelperm-{MODEL_INFO['name']}",
"object": "model_permission",
"created": int(time.time()),
"allow_create_engine": False,
"allow_sampling": False,
"allow_logprobs": False,
"allow_search_indices": False,
"allow_view": False,
"allow_fine_tuning": False,
"organization": "*",
"group": None,
"is_blocking": False,
}
],
)
return ModelsResponse(data=[model_data])
if __name__ == "__main__":
import argparse
import uvicorn
parser = argparse.ArgumentParser(description="Run the Florence-2 server")
parser.add_argument(
"--port", type=int, default=8000, help="Port to run the server on"
)
args = parser.parse_args()
print("Using Florence-2 model")
uvicorn.run(app, host="0.0.0.0", port=args.port)
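A minimal client sketch for the standalone Florence-2 server above, assuming it is running locally on its default port 8000; the image path and prompt text are placeholders, and the request shape mirrors the ChatCompletionRequest schema the server parses.

import base64
import httpx

def caption_image(path: str, port: int = 8000) -> str:
    # Encode the local image as a data URL, one of the forms get_image_from_url accepts.
    with open(path, "rb") as f:
        data_url = "data:image/png;base64," + base64.b64encode(f.read()).decode()
    payload = {
        "model": "florence2-base-ft",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ],
        "max_tokens": 256,
    }
    resp = httpx.post(f"http://localhost:{port}/v1/chat/completions", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

print(caption_image("screenshot.png"))  # "screenshot.png" is a hypothetical local file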

View File

@@ -0,0 +1,182 @@
from fastapi import FastAPI, HTTPException
import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
import time
from memos_ml_backends.schemas import (
ChatCompletionRequest,
ChatCompletionResponse,
ModelData,
ModelsResponse,
get_image_from_url,
)
MODEL_INFO = {"name": "Qwen2-VL-2B-Instruct", "max_model_len": 32768}
# Detect the available device
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
torch_dtype = (
torch.float32
if (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] <= 6)
or (not torch.cuda.is_available() and not torch.backends.mps.is_available())
else torch.float16
)
print(f"Using device: {device}")
# Load Qwen2VL model
qwen2vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct",
torch_dtype=torch_dtype,
device_map="auto",
).to(device, torch_dtype)
qwen2vl_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4")
app = FastAPI()
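# Qwen2-VL helper: build a single-turn chat message, render it through the
# model's chat template, then strip the echoed prompt tokens from the output
# ids so only the newly generated answer is decoded and returned.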
async def generate_qwen2vl_result(text_input, image_input, max_tokens):
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image_input},
{"type": "text", "text": text_input},
],
}
]
text = qwen2vl_processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = qwen2vl_processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(device)
generated_ids = qwen2vl_model.generate(**inputs, max_new_tokens=(max_tokens or 512))
generated_ids_trimmed = [
out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = qwen2vl_processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
return output_text[0] if output_text else ""
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
try:
last_message = request.messages[-1]
text_input = last_message.get("content", "")
image_input = None
if isinstance(text_input, list):
for content in text_input:
if content.get("type") == "image_url":
image_url = content["image_url"].get("url")
image_input = await get_image_from_url(image_url)
break
text_input = " ".join(
[
content["text"]
for content in text_input
if content.get("type") == "text"
]
)
if image_input is None:
raise ValueError("Image input is required")
parsed_answer = await generate_qwen2vl_result(
text_input, image_input, request.max_tokens
)
result = ChatCompletionResponse(
id=str(int(time.time())),
object="chat.completion",
created=int(time.time()),
model=request.model,
choices=[
{
"index": 0,
"message": {
"role": "assistant",
"content": parsed_answer,
},
"finish_reason": "stop",
}
],
usage={
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0,
},
)
return result
except Exception as e:
print(f"Error generating chat completion: {str(e)}")
raise HTTPException(
status_code=500, detail=f"Error generating chat completion: {str(e)}"
)
# Add the new GET /v1/models endpoint
@app.get("/v1/models", response_model=ModelsResponse)
async def get_models():
model_data = ModelData(
id=MODEL_INFO["name"],
created=int(time.time()),
max_model_len=MODEL_INFO["max_model_len"],
permission=[
{
"id": f"modelperm-{MODEL_INFO['name']}",
"object": "model_permission",
"created": int(time.time()),
"allow_create_engine": False,
"allow_sampling": False,
"allow_logprobs": False,
"allow_search_indices": False,
"allow_view": False,
"allow_fine_tuning": False,
"organization": "*",
"group": None,
"is_blocking": False,
}
],
)
return ModelsResponse(data=[model_data])
if __name__ == "__main__":
import argparse
import uvicorn
parser = argparse.ArgumentParser(description="Run the Qwen2VL server")
parser.add_argument(
"--port", type=int, default=8000, help="Port to run the server on"
)
args = parser.parse_args()
print("Using Qwen2VL model")
uvicorn.run(app, host="0.0.0.0", port=args.port)
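Both servers expose the same OpenAI-style surface, so a caller can use GET /v1/models to confirm which backend it reached. A small probe sketch, assuming the Qwen2-VL server was started with --port 8001 (the port choice is an assumption, not part of this diff):

import httpx

resp = httpx.get("http://localhost:8001/v1/models", timeout=10)
resp.raise_for_status()
for model in resp.json()["data"]:
    # Expected to print "Qwen2-VL-2B-Instruct 32768" for the server above.
    print(model["id"], model["max_model_len"])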

View File

@@ -2,7 +2,6 @@ einops
timm
transformers
sentence-transformers
git+https://github.com/huggingface/transformers
transformers
qwen-vl-utils
auto-gptq
optimum
optimum

View File

@@ -0,0 +1,48 @@
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import httpx
from PIL import Image
import base64
import io
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Dict[str, Any]]
max_tokens: Optional[int] = None
class ChatCompletionResponse(BaseModel):
id: str
object: str
created: int
model: str
choices: List[Dict[str, Any]]
usage: Dict[str, int]
class ModelData(BaseModel):
id: str
object: str = "model"
created: int
owned_by: str = "transformers"
root: str = "models"
parent: Optional[str] = None
max_model_len: int
permission: List[Dict[str, Any]]
class ModelsResponse(BaseModel):
object: str = "list"
data: List[ModelData]
async def get_image_from_url(image_url):
if image_url.startswith("data:image/"):
image_data = base64.b64decode(image_url.split(",")[1])
return Image.open(io.BytesIO(image_data))
elif image_url.startswith("file://"):
file_path = image_url[len("file://") :]
return Image.open(file_path)
else:
async with httpx.AsyncClient() as client:
response = await client.get(image_url)
response.raise_for_status()
image_data = response.content
return Image.open(io.BytesIO(image_data))
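The helper above resolves three URL shapes (base64 data URLs, file:// paths, and plain HTTP(S) URLs) into a PIL image. A short sketch exercising it directly, assuming memos_ml_backends is importable and that sample.png exists in the working directory (the file name is a placeholder):

import asyncio
from memos_ml_backends.schemas import get_image_from_url

async def main():
    # A local file referenced through the file:// scheme.
    img = await get_image_from_url("file://sample.png")
    print(img.size)

asyncio.run(main())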

View File

@@ -1,260 +0,0 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import numpy as np
import httpx
import torch
from PIL import Image
import base64
import io
from transformers import (
AutoProcessor,
AutoModelForCausalLM,
Qwen2VLForConditionalGeneration,
)
from qwen_vl_utils import process_vision_info
import time
import argparse
# Detect the available device
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
torch_dtype = (
torch.float32
if (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] <= 6)
or (not torch.cuda.is_available() and not torch.backends.mps.is_available())
else torch.float16
)
print(f"Using device: {device}")
# Add a configuration option to choose the model
parser = argparse.ArgumentParser(description="Run the server with specified model")
parser.add_argument("--florence", action="store_true", help="Use Florence-2 model")
parser.add_argument("--qwen2vl", action="store_true", help="Use Qwen2VL model")
args = parser.parse_args()
# Replace the USE_FLORANCE_MODEL configuration with this
use_florence_model = args.florence if (args.florence or args.qwen2vl) else True
# Initialize models based on the configuration
if use_florence_model:
# Load Florence-2 model
florence_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Florence-2-base-ft",
torch_dtype=torch_dtype,
attn_implementation="sdpa",
trust_remote_code=True,
).to(device, torch_dtype)
florence_processor = AutoProcessor.from_pretrained(
"microsoft/Florence-2-base-ft", trust_remote_code=True
)
else:
# Load Qwen2VL model
qwen2vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4",
torch_dtype=torch_dtype,
device_map="auto",
).to(device, torch_dtype)
qwen2vl_processor = AutoProcessor.from_pretrained(
"Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4"
)
async def get_image_from_url(image_url):
if image_url.startswith("data:image/"):
image_data = base64.b64decode(image_url.split(",")[1])
return Image.open(io.BytesIO(image_data))
elif image_url.startswith("file://"):
file_path = image_url[len("file://") :]
return Image.open(file_path)
else:
async with httpx.AsyncClient() as client:
response = await client.get(image_url)
response.raise_for_status()
image_data = response.content
return Image.open(io.BytesIO(image_data))
async def generate_florence_result(text_input, image_input, max_tokens):
task_prompt = "<MORE_DETAILED_CAPTION>"
prompt = task_prompt + ""
inputs = florence_processor(
text=prompt, images=image_input, return_tensors="pt"
).to(device, torch_dtype)
generated_ids = florence_model.generate(
input_ids=inputs["input_ids"],
pixel_values=inputs["pixel_values"],
max_new_tokens=max_tokens or 1024,
do_sample=False,
num_beams=3,
)
generated_texts = florence_processor.batch_decode(
generated_ids, skip_special_tokens=False
)
# Post-process the generated text
parsed_answer = florence_processor.post_process_generation(
generated_texts[0],
task=task_prompt,
image_size=(image_input.width, image_input.height),
)
return parsed_answer.get(task_prompt, "")
async def generate_qwen2vl_result(text_input, image_input, max_tokens):
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image_input},
{"type": "text", "text": text_input},
],
}
]
text = qwen2vl_processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = qwen2vl_processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(device)
generated_ids = qwen2vl_model.generate(**inputs, max_new_tokens=(max_tokens or 512))
generated_ids_trimmed = [
out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = qwen2vl_processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)
return output_text[0] if output_text else ""
app = FastAPI()
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Dict[str, Any]]
max_tokens: Optional[int] = None
class ChatCompletionResponse(BaseModel):
id: str
object: str
created: int
model: str
choices: List[Dict[str, Any]]
usage: Dict[str, int]
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
try:
last_message = request.messages[-1]
text_input = last_message.get("content", "")
image_input = None
# Process text and image input
if isinstance(text_input, list):
for content in text_input:
if content.get("type") == "image_url":
image_url = content["image_url"].get("url")
image_input = await get_image_from_url(image_url)
break
text_input = " ".join(
[
content["text"]
for content in text_input
if content.get("type") == "text"
]
)
if image_input is None:
raise ValueError("Image input is required")
# Use the selected model for generation
if use_florence_model:
parsed_answer = await generate_florence_result(
text_input, image_input, request.max_tokens
)
else:
parsed_answer = await generate_qwen2vl_result(
text_input, image_input, request.max_tokens
)
result = ChatCompletionResponse(
id=str(int(time.time())),
object="chat.completion",
created=int(time.time()),
model=request.model,
choices=[
{
"index": 0,
"message": {
"role": "assistant",
"content": parsed_answer,
},
"finish_reason": "stop",
}
],
usage={
"prompt_tokens": 0,
"total_tokens": 0,
"completion_tokens": 0,
},
)
return result
except Exception as e:
print(f"Error generating chat completion: {str(e)}")
raise HTTPException(
status_code=500, detail=f"Error generating chat completion: {str(e)}"
)
if __name__ == "__main__":
import uvicorn
parser = argparse.ArgumentParser(
description="Run the server with specified model and port"
)
parser.add_argument("--florence", action="store_true", help="Use Florence-2 model")
parser.add_argument("--qwen2vl", action="store_true", help="Use Qwen2VL model")
parser.add_argument(
"--port", type=int, default=8000, help="Port to run the server on"
)
args = parser.parse_args()
if args.florence and args.qwen2vl:
print("Error: Please specify only one model (--florence or --qwen2vl)")
exit(1)
elif not args.florence and not args.qwen2vl:
print("No model specified, using default (Florence-2)")
use_florence_model = args.florence if (args.florence or args.qwen2vl) else True
print(f"Using {'Florence-2' if use_florence_model else 'Qwen2VL'} model")
uvicorn.run(app, host="0.0.0.0", port=args.port)