A Review of Deploying DeepSeek
Dependencies installed:
apt-get install python3 python3-pip
python3 -m venv deepseek_env
source deepseek_env/bin/activate
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118  # CUDA 11.8 build, for machines with a GPU
pip install transformers  # the Transformers library
pip install fastapi uvicorn
pip install transformers huggingface_hub
pip install datasets
pip install modelscope  # used to download the model
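Before going further, a quick sanity check is useful: confirm that PyTorch sees the GPU and that the key libraries import cleanly. This is an optional helper script (not part of the original steps), run inside the virtual environment created above:
# check_env.py — optional environment sanity check
import torch
import transformers
import fastapi

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))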
Download the model:
modelscope download --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --local_dir "/root/autodl-tmp"
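Once the download finishes, it is worth confirming that the model files actually landed in the target directory before loading them. A minimal check with the standard library, assuming the --local_dir used above (exact file names can vary by model):
import os

model_dir = "/root/autodl-tmp"  # the --local_dir used above
files = os.listdir(model_dir)
print(files)
for name in ["config.json", "tokenizer_config.json"]:  # typical files; names may vary
    print(name, "found" if name in files else "MISSING")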
Write a test program, run_model.py:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Local path to the model
model_path = "./"
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
# Move the model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Input text
input_text = "Please write a Python program that prints hello world"
# Encode the input
inputs = tokenizer(input_text, return_tensors="pt").to(device)
# Generate output
outputs = model.generate(**inputs, max_length=50)
# Decode the output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)
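Since DeepSeek-R1-Distill-Qwen-1.5B is a chat-tuned model, wrapping the prompt with the tokenizer's chat template usually produces better answers than feeding raw text. A small variant of the test above (reusing the tokenizer, model and device already loaded; the token budget of 200 is just an example):
# Alternative: build the prompt with the model's chat template
messages = [{"role": "user", "content": "Please write a Python program that prints hello world"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=200)
# Decode only the newly generated tokens, skipping the prompt
print(tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))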
Write a FastAPI program:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the FastAPI app
app = FastAPI()

# Load the model and tokenizer
model_path = "./"  # replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Move the model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Request body schema
class ChatHistoryItem(BaseModel):
    role: str      # "user" or "assistant"
    content: str   # the user question or the model's answer

class TextGenerationRequest(BaseModel):
    history: List[ChatHistoryItem]  # conversation history used as context
    max_length: int = 50            # maximum number of tokens to generate
    temperature: float = 0.7        # randomness of the generated text
    top_k: int = 50                 # top-k sampling
    top_p: float = 0.9              # top-p (nucleus) sampling

# API endpoint
@app.post("/generate/")
async def generate_text(request: TextGenerationRequest):
    try:
        # Prepend a system prompt
        system_prompt = "You are an intelligent assistant. Provide accurate, concise answers based on the user's question and the context."
        context = system_prompt + "\n"
        # Concatenate the conversation history into the context
        for item in request.history:
            if item.role == "user":
                context += f"User: {item.content}\n"
            elif item.role == "assistant":
                context += f"Assistant: {item.content}\n"
        # Encode the input
        inputs = tokenizer(context, return_tensors="pt").to(device)
        # Reject inputs longer than the model's context window
        max_model_length = model.config.max_position_embeddings
        if inputs.input_ids.shape[1] > max_model_length:
            raise HTTPException(
                status_code=400,
                detail=f"Input exceeds the model limit ({max_model_length} tokens). Please shorten the context."
            )
        # Generate text; max_new_tokens counts only newly generated tokens,
        # so a long context does not eat into the generation budget
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=request.max_length,
            temperature=request.temperature,
            top_k=request.top_k,
            top_p=request.top_p,
            do_sample=True,                       # enable sampling
            pad_token_id=tokenizer.eos_token_id,  # use EOS as the padding token
        )
        # Decode only the newly generated tokens (drop the prompt)
        assistant_response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        ).strip()
        # Return the generated text
        return {"generated_text": assistant_response}
    except HTTPException:
        # Let explicit HTTP errors (e.g. the 400 above) pass through unchanged
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run the FastAPI app
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
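With the service running, the endpoint can be exercised from any HTTP client. A minimal client sketch, assuming the requests package is installed and the server is reachable on port 8000 (history, max_length and temperature match the request schema above):
import requests  # pip install requests

payload = {
    "history": [
        {"role": "user", "content": "Please write a Python program that prints hello world"}
    ],
    "max_length": 200,
    "temperature": 0.7,
}
resp = requests.post("http://127.0.0.1:8000/generate/", json=payload)
resp.raise_for_status()
print(resp.json()["generated_text"])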