Install the dependencies:

apt-get install python3 python3-pip python3-venv  # python3-venv is needed for the venv step below
python3 -m venv deepseek_env                      # create an isolated virtual environment
source deepseek_env/bin/activate

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118  # CUDA 11.8 build, for machines with a GPU
pip install transformers  # the Transformers library
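Before going further, it is worth checking that the CUDA build of PyTorch was actually picked up. A quick sanity check using standard PyTorch calls:

import torch

# Prints the installed PyTorch version and whether a CUDA GPU is visible.
# False here means the CPU-only wheel was installed or the NVIDIA driver is missing.
print(torch.__version__)
print(torch.cuda.is_available())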




pip install fastapi uvicorn               # web framework + ASGI server for the API
pip install transformers huggingface_hub  # transformers is already installed above; huggingface_hub is the Hub client
pip install datasets                      # the Hugging Face datasets library
pip install modelscope                    # used below to download the model

Download the model:

modelscope download --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --local_dir "/root/autodl-tmp"
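The same download can also be done from Python. A minimal sketch using modelscope's snapshot_download API (note: with cache_dir the files land in a model-id-named subdirectory of the given path, unlike --local_dir above; newer modelscope versions also accept a local_dir argument):

from modelscope import snapshot_download

# Download the model files under /root/autodl-tmp; snapshot_download returns
# the actual directory that contains them (a model-id-named subdirectory).
model_dir = snapshot_download(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    cache_dir="/root/autodl-tmp",
)
print(model_dir)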

Write a test program, run_model.py:


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local path to the model (e.g. "/root/autodl-tmp" from the download step;
# "./" works if the script is run from inside the model directory)
model_path = "./"

# Load the tokenizer and model directly from the local files
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Move the model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Input text
input_text = "Please write a Python program that prints hello world"

# Encode the input
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Generate output (max_length counts prompt tokens plus generated tokens)
outputs = model.generate(**inputs, max_length=50)

# Decode the output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(output_text)
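run_model.py feeds the prompt to the model as raw text. DeepSeek-R1-Distill-Qwen-1.5B is a chat model, so results are usually better when the prompt is wrapped in the model's chat template. A minimal sketch using the standard transformers apply_chat_template API, reusing tokenizer, model and device from the script above (it assumes the tokenizer ships a chat template, which the R1-Distill releases do):

messages = [
    {"role": "user", "content": "Please write a Python program that prints hello world"},
]

# Build a prompt in the format the model was trained on
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # append the assistant turn marker
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=256)

# Decode only the newly generated tokens, not the prompt
new_tokens = outputs[0][inputs.input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))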

Write a FastAPI program:

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the FastAPI application
app = FastAPI()

# Load the model and tokenizer
model_path = "./"  # replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Move the model to the GPU if one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Request body schema
class ChatHistoryItem(BaseModel):
    role: str  # "user" or "assistant"
    content: str  # user question or model answer

class TextGenerationRequest(BaseModel):
    history: List[ChatHistoryItem]  # previous conversation turns
    max_length: int = 50  # maximum number of tokens to generate
    temperature: float = 0.7  # randomness of the generated text
    top_k: int = 50  # top-k sampling
    top_p: float = 0.9  # top-p (nucleus) sampling

# API endpoint
@app.post("/generate/")
async def generate_text(request: TextGenerationRequest):
    try:
        # System prompt
        system_prompt = "You are an intelligent assistant. Answer the user's questions accurately and concisely, taking the conversation context into account."
        context = system_prompt + "\n"

        # Concatenate the conversation history into a single context string
        for item in request.history:
            if item.role == "user":
                context += f"User: {item.content}\n"
            elif item.role == "assistant":
                context += f"Assistant: {item.content}\n"

        # Cue the model to answer as the assistant
        context += "Assistant:"

        # Encode the input
        inputs = tokenizer(context, return_tensors="pt").to(device)

        # Check that the input does not exceed the model's context window
        max_model_length = model.config.max_position_embeddings
        if inputs.input_ids.shape[1] > max_model_length:
            raise HTTPException(
                status_code=400,
                detail=f"Input exceeds the model's limit of {max_model_length} tokens. Please shorten the context."
            )

        # Generate text
        outputs = model.generate(
            **inputs,  # pass input_ids and attention_mask
            max_new_tokens=request.max_length,  # budget for new tokens only, so the prompt length is not counted against it
            temperature=request.temperature,
            top_k=request.top_k,
            top_p=request.top_p,
            do_sample=True,  # enable sampling
            pad_token_id=tokenizer.eos_token_id,  # use the EOS token for padding
        )

        # Decode only the newly generated tokens (everything after the prompt)
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Return the generated text
        return {"generated_text": assistant_response}

    except HTTPException:
        # Let HTTP errors (e.g. the 400 above) pass through unchanged
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run the FastAPI application
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
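Once the server is running (for example via python app.py, assuming the code above is saved as app.py; the filename is not fixed by the steps above), the endpoint can be exercised with a small client script. A sketch using the requests library (pip install requests if it is not already available); the payload fields match the TextGenerationRequest schema defined above:

import requests

payload = {
    "history": [
        {"role": "user", "content": "Please write a Python program that prints hello world"},
    ],
    "max_length": 200,
    "temperature": 0.7,
}

# POST the conversation history to the /generate/ endpoint
resp = requests.post("http://127.0.0.1:8000/generate/", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])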