0. Abstract:
Use FastAPI and uvicorn to build an HTTP interface service for a locally deployed large language model, decoupling the model from downstream applications (such as langchain). The setup has two parts: a server and a client.
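Because the model sits behind a plain HTTP endpoint, any downstream framework can consume it without importing the model code. As a rough illustration of the decoupling (my addition, not from the original post, and LangChain's API varies between versions), a custom LLM wrapper around the /chat/ endpoint might look like this:

# Sketch only: assumes a LangChain version exposing langchain.llms.base.LLM;
# the class name LocalBaichuan and the URL are illustrative, matching the
# server defined in section 1.
from typing import List, Optional
import requests
from langchain.llms.base import LLM

class LocalBaichuan(LLM):
    url: str = "http://127.0.0.1:6667/chat/"

    @property
    def _llm_type(self) -> str:
        return "local-baichuan"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Forward the prompt to the FastAPI server and return the raw text.
        resp = requests.post(self.url, json={"text": prompt}, timeout=300)
        resp.raise_for_status()
        return resp.json()["result"]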
1. Server
The server loads the model once at startup and exposes a single POST /chat/ endpoint.
import os
import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

# Restrict the process to the GPUs that should host the model.
os.environ["CUDA_VISIBLE_DEVICES"] = "5,6,7"

app = FastAPI()

class Query(BaseModel):
    text: str

# Load Baichuan-13B-Chat from a local checkpoint; device_map="auto" shards
# the fp16 weights across the visible GPUs.
path = "/workdir/model/baichuan13b_chat/"
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)
model.generation_config = GenerationConfig.from_pretrained(path)

@app.post("/chat/")
async def chat(query: Query):
    input_ids = tokenizer([query.text]).input_ids
    output_ids = model.generate(
        torch.as_tensor(input_ids).cuda(),
        do_sample=False,  # greedy decoding; temperature/repetition_penalty are ignored here, so they are omitted
        max_new_tokens=1024,
    )
    # Strip the prompt tokens and decode only the newly generated text.
    output_ids = output_ids[0][len(input_ids[0]):]
    outputs = tokenizer.decode(
        output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
    )
    return {"result": outputs}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=6667)
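One caveat: model.generate blocks, and because the handler is declared async def, a long generation stalls uvicorn's event loop. A minimal sketch of one common workaround (my addition, not in the original): declare the handler as a plain def so FastAPI runs it in a worker thread, and serialize GPU access with a lock. This assumes it is added to the same file as the server above, reusing app, tokenizer, model, and Query:

# Sketch only: /chat_safe/ is a hypothetical alternative endpoint, not part
# of the original service.
import threading

gpu_lock = threading.Lock()

@app.post("/chat_safe/")
def chat_safe(query: Query):
    # Plain `def` handlers run in FastAPI's threadpool; the lock ensures only
    # one request drives the GPU at a time.
    with gpu_lock:
        input_ids = tokenizer([query.text]).input_ids
        output_ids = model.generate(
            torch.as_tensor(input_ids).cuda(),
            do_sample=False,
            max_new_tokens=1024,
        )
    output_ids = output_ids[0][len(input_ids[0]):]
    return {"result": tokenizer.decode(output_ids, skip_special_tokens=True)}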
2. Client
The client is a plain requests call against the endpoint.
import requests

# Note: 0.0.0.0 is a server bind address, not a destination; clients should
# target a routable address such as 127.0.0.1 (or the server's LAN IP).
url = "http://127.0.0.1:6667/chat/"
query = {"text": "你好,请做一段自我介绍。"}  # "Hello, please introduce yourself."

response = requests.post(url, json=query)
if response.status_code == 200:
    result = response.json()
    print("BOT:", result["result"])
else:
    print("Error:", response.status_code, response.text)