开源大模型本地部署

开源大模型本地部署 | 极客日志

# Please install OpenAI SDK first: pip3 install openai
from openai import OpenAI

# Build an OpenAI-SDK client that points at the DeepSeek endpoint.
client = OpenAI(base_url="https://api.deepseek.com", api_key="YOUR_API_KEY")

# Conversation to send: a system instruction followed by the user's question.
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "明月几时有"},
]

# stream=False requests the reply as a single, complete response object.
response = client.chat.completions.create(
    model="deepseek-chat", messages=chat_messages, stream=False
)

# Show the text carried by the first choice of the response.
first_choice = response.choices[0]
print(first_choice.message.content)

pip install openai

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com"
)

# stream=True makes the call return an iterator of incremental chunks.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "明月几时有"},
    ],
    stream=True,
)

# Streaming output: print each fragment as it arrives, followed by the text
# accumulated so far.
out = []
for chunk in response:
    # A chunk may carry no choices, and a delta's content is optional in the
    # SDK's types; skip both so indexing and ''.join(out) cannot fail.
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece is None:
        continue
    print(piece)
    out.append(piece)
    print('-' * 10)
    print(''.join(out))

import time

def test(count=10, delay=1):
    """Generator demo: lazily yield the integers 0 .. count - 1.

    Args:
        count: How many integers to produce (default 10, as before).
        delay: Seconds to sleep before producing each value (default 1,
            as before).

    Yields:
        The next integer of the sequence.
    """
    for i in range(count):
        # This body only advances when the consumer asks for the next value.
        time.sleep(delay)
        yield i

if __name__ == "__main__":
    numbers = test()
    # Calling a generator function does not run its body; it returns a
    # generator object, which is what gets printed here.
    print(numbers)
    # Each iteration resumes the function body until it reaches the next yield.
    for value in numbers:
        print(value)

项目	流式输出	非流式输出
返回方式	边生成边返回	全部生成后一次返回
响应速度	快	慢（尤其是长文本）
使用体验	更自然（打字式）	等待过程较长
编程复杂度	稍复杂（需拼接）	简单
适用场景	对话生成、直播问答	简短回复、结构化处理

from openai import OpenAI

class DeepseekAPI:
    """Thin wrapper around the DeepSeek chat endpoint (non-streaming)."""

    def __init__(self, api_key):
        """Store the API key and create an OpenAI-SDK client for DeepSeek."""
        self.api_key = api_key
        self.client = OpenAI(base_url="https://api.deepseek.com", api_key=api_key)

    def inference(self, messages):
        """Send ``messages`` and return the reply text of the first choice.

        Args:
            messages: Chat messages passed through unchanged to the API.

        Returns:
            The ``content`` of the first choice's message.
        """
        # stream=False asks for the whole reply in one response object.
        request = {"model": "deepseek-chat", "messages": messages, "stream": False}
        completion = self.client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content

# Demo: ask one question and print the complete (non-streaming) answer.
if __name__ == "__main__":
    api_key = "YOUR_API_KEY"  # API key
    # Conversation to send
    messages = [
        {"role": "system", "content": "你是一名 AI 助手"},
        {"role": "user", "content": "请简要介绍一下你自己"},
    ]
    # The former `stream` variable was never read (streaming is fixed inside
    # DeepseekAPI.inference), so it has been dropped to avoid implying a switch
    # that does not exist.
    deepseek_api = DeepseekAPI(api_key)  # build the API wrapper
    result = deepseek_api.inference(messages)  # run the request
    print(result)  # show the reply text

# 流式输出
from openai import OpenAI

class DeepseekAPI:
    """Thin wrapper around the DeepSeek chat endpoint (streaming)."""

    def __init__(self, api_key):
        """Store the API key and create an OpenAI-SDK client for DeepSeek."""
        self.api_key = api_key  # API key
        self.client = OpenAI(
            api_key=api_key, base_url="https://api.deepseek.com"
        )  # OpenAI-SDK client bound to the DeepSeek base URL

    def inference(self, messages):
        """Stream the reply to ``messages`` as text fragments.

        Args:
            messages: Chat messages passed through unchanged to the API.

        Yields:
            Each text fragment of the reply, in arrival order. Chunks without
            choices and deltas whose content is ``None`` are skipped, so every
            yielded item is a ``str``.
        """
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            stream=True,  # request an iterator of incremental chunks
        )
        for chunk in response:
            if not chunk.choices:  # chunk carries no choices
                continue
            content = chunk.choices[0].delta.content
            # The delta content is optional; yielding None would make callers
            # print "None" or fail when concatenating fragments.
            if content is not None:
                yield content

# Demo: ask one question and print the answer as it streams in.
if __name__ == "__main__":
    api_key = "YOUR_API_KEY"  # API key
    # Conversation to send
    messages = [
        {"role": "system", "content": "你是一名乐于助人的人工智能助手"},
        {"role": "user", "content": "请简要介绍一下你自己"},
    ]
    # The former `stream = False` variable was never read and contradicted the
    # streaming behaviour of DeepseekAPI.inference, so it has been dropped.
    deepseek_api = DeepseekAPI(api_key)  # build the API wrapper
    result = deepseek_api.inference(messages)  # generator of text fragments
    for chunk in result:
        if chunk:  # ignore fragments that carry no text
            # end="" keeps fragments on one line; flush shows them immediately.
            print(chunk, end="", flush=True)
    print()  # finish the streamed line

命令	说明
`ollama pull <模型名>`	下载模型（如 `llama3`）
`ollama run <模型名>`	运行模型交互式对话
`ollama list`	查看已安装模型
`ollama rm <模型名>`	删除模型

ollama pull qwen:7b           # 下载 Qwen-7B
ollama run qwen:7b            # 启动聊天

POST http://localhost:11434/api/chat
Content-Type: application/json

{
  "model": "qwen3:0.6b",
  "messages": [
    { "role": "user", "content": "LLM 是什么？" }
  ],
  "stream": true
}

{
  "message": {
    "role": "assistant",
    "content": "你好！我不知道实时天气信息，但你可以查看天气预报网站获取最新天气。"
  },
  "done": true
}

REM Windows cmd: route downloads through the hf-mirror endpoint to speed them up.
REM cmd has no "#" comment syntax, so comments must be on their own REM lines;
REM trailing text would become part of the variable value or extra arguments.
REM On Linux/macOS use instead: export HF_ENDPOINT=https://hf-mirror.com
set HF_ENDPOINT=https://hf-mirror.com
REM Download the model files into local directories.
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --local-dir ./deepseek
huggingface-cli download thenlper/gte-large --local-dir ./gte-large
huggingface-cli download BAAI/bge-base-zh --local-dir ./bge-base-zh

pip install huggingface_hub

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Step 1: load the model
model_path = r"./modeldir"  # path to the local model directory
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Use half precision only on GPU. On the CPU fallback float16 kernels are slow
# or unavailable for some operators, so use float32 there.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32
# Load the weights with the chosen dtype and move the model to the device.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)  # load the tokenizer

# Step 2: generation parameters and input messages
gen_kwargs = {
    "max_length": 1024,  # maximum sequence length
    "do_sample": True,  # sample from the distribution instead of greedy decoding
    "top_k": 10,  # keep the k most likely tokens; larger is more random
    "temperature": 0.7,  # output diversity; larger is more creative
    "top_p": 0.8,  # nucleus sampling threshold; larger is more random
    "repetition_penalty": 1.2,  # larger values discourage repetition more
}

# Conversation to send
messages = [
    {"role": "system", "content": "你是 AI 助手"},
    {"role": "user", "content": "明月几时有"},
]

# Step 3: turn the messages into model inputs
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True,
).to(device)  # move the encoded inputs to the same device as the model

# Step 4: generate
outputs = model.generate(**inputs, **gen_kwargs)
# Drop the prompt tokens so only the newly generated part is decoded.
outputs = outputs[:, inputs["input_ids"].shape[1]:]
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Step 5: print the result
print(result)

device
- 概念：指定模型运行的计算设备（CPU 或 GPU）。在 PyTorch 中通常为 "cpu" 或 "cuda:0"。
- 设置建议：优先使用 GPU（如 device="cuda:0"），显存不足时用 CPU。
torch_dtype
- 概念：模型张量的数据类型，如 float32（高精度）、float16 或 bfloat16（低精度，节省显存）。
- 影响：精度越高（如 float32），结果越精确，但显存占用更大。精度越低（如 float16），显存占用少，但可能损失精度或数值不稳定。
- 设置建议：GPU 推荐 torch.float16 或 bfloat16（兼容性需确认）；CPU 通常用 float32。
max_length
- 概念：生成序列的最大总长度（token 数量），包含输入提示（prompt）的 token；若只想限制新生成的部分，应使用 max_new_tokens。
- 影响：值越大，生成内容越长，但速度越慢，且可能重复或偏离主题。值过小可能导致回答不完整。
- 设置建议：根据任务调整：对话建议 100-300，长文本生成可设 512-1024，注意模型最大限制（如 4096）。
do_sample
- 概念：是否启用采样策略（如 top_k, top_p）。若为 False，则使用贪心解码（确定性强）。
- 影响：True：输出多样化，适合创意任务。False：输出确定性强，适合事实性问题。
- 设置建议：需要多样性时设为 True，需准确性时设为 False。
top_k
- 概念：采样时保留概率最高的前 k 个 token。
- 影响：值越大（如 100），候选 token 多，输出多样但可能不相关。值越小（如 10），输出更确定但可能重复。
- 设置建议：平衡点常为 10-50；需创造性时调高，需保守时调低。
top_p（核采样）
- 概念：从累积概率超过阈值 p 的最小 token 集合中采样。
- 影响：值大（如 0.95）：候选 token 多，输出多样。值小（如 0.5）：候选 token 少，输出更集中。
- 设置建议：常用 0.7-0.95。
repetition_penalty
- 概念：惩罚重复 token 的权重（>1.0 时抑制重复，<1.0 时鼓励重复）。
- 影响：值大（如 2.0）：减少重复，但可能生成不自然内容。值小（如 1.0）：无惩罚，默认行为。
- 设置建议：通常设为 1.0-1.2，明显重复时可设 1.2-1.5。

应用场景	推荐设置
正常文本生成（如聊天）	1.1～1.3（防止重复）
模仿风格性强文本（如古诗）	1.0（或略小）
模型不断重复一句话？	适当增大 penalty（如 1.5）

Token	Logits	Softmax（T=1）	Softmax（T=0.5）	Softmax（T=2）
"猫"	4.0	0.665	0.867	0.506
"狗"	3.0	0.245	0.117	0.307
"鸟"	2.0	0.090	0.016	0.186

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class DeepSeek:
    """Local causal-LM wrapper that answers a chat in one (non-streaming) call."""

    def __init__(self, model_path, device, torch_dtype):
        """Load model and tokenizer from ``model_path``; put the model on ``device``."""
        self.device = device
        loaded = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch_dtype
        )
        self.model = loaded.to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def inference(self, messages, gen_kwargs):
        """Generate a reply for ``messages`` and return it as decoded text.

        Args:
            messages: Chat messages rendered through the tokenizer's chat template.
            gen_kwargs: Keyword arguments forwarded to ``model.generate``.

        Returns:
            The decoded text of the tokens that follow the prompt.
        """
        encoded = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )
        encoded = encoded.to(self.device)  # same device as the model
        generated = self.model.generate(**encoded, **gen_kwargs)
        # Everything before this offset is the prompt; keep only what follows it.
        prompt_len = encoded["input_ids"].shape[1]
        completion = generated[:, prompt_len:]
        return self.tokenizer.decode(completion[0], skip_special_tokens=True)

if __name__ == "__main__":
    # Step 1: choose model path and device, then load the model
    model_path = r"./modeldir"  # replace with your model path
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # Use half precision only on GPU. On the CPU fallback float16 kernels are
    # slow or unavailable for some operators, so use float32 there.
    torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32
    deepseek = DeepSeek(model_path, device, torch_dtype)

    # Step 2: generation parameters and input messages
    gen_kwargs = {
        "max_length": 1024,  # maximum sequence length
        "do_sample": True,  # sample from the distribution instead of greedy decoding
        "top_k": 10,  # keep the k most likely tokens; larger is more random
        "temperature": 0.7,  # output diversity; larger is more creative
        "top_p": 0.8,  # nucleus sampling threshold; larger is more random
        "repetition_penalty": 1.2,  # larger values discourage repetition more
    }

    # Conversation to send
    messages = [
        {"role": "system", "content": "你是一名乐于助人的人工智能助手"},
        {"role": "user", "content": "写一个 js 判断用户验证码代码"},
    ]
    result = deepseek.inference(messages, gen_kwargs)  # run inference
    print(result)  # show the reply text

from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread

class DeepSeek:
    """Local causal-LM wrapper that streams its reply fragment by fragment."""

    def __init__(self, model_path, device, torch_dtype):
        """Load model and tokenizer from ``model_path``; put the model on ``device``."""
        self.device = device  # inference device
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch_dtype
        ).to(device)  # load the weights and move the model to the device
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)  # load the tokenizer

    def inference(self, messages, gen_kwargs):
        """Stream the reply to ``messages`` as decoded text fragments.

        Generation runs in a background thread that feeds a
        ``TextIteratorStreamer``; this generator relays what the streamer emits.

        Args:
            messages: Chat messages rendered through the tokenizer's chat template.
            gen_kwargs: Keyword arguments forwarded to ``model.generate``.

        Yields:
            Text fragments of the reply, in generation order.
        """
        inputs = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True,
        ).to(self.device)  # same device as the model
        # skip_prompt=True keeps the prompt out of the stream, matching the
        # non-streaming variant, which slices the prompt tokens off.
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generation_kwargs = dict(**inputs, **gen_kwargs, streamer=streamer)
        # generate() blocks until it finishes, so run it in a worker thread and
        # consume the streamer from this one.
        thread = Thread(
            target=self.model.generate, kwargs=generation_kwargs
        )
        thread.start()
        for new_text in streamer:
            yield new_text
        # The streamer is exhausted once generation ends; reap the worker.
        thread.join()

if __name__ == "__main__":
    # Step 1: choose model path and device, then load the model
    model_path = r"./modeldir"  # replace with your model path
    # Fall back to CPU when no GPU is available instead of hard-coding "cuda",
    # which fails on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Half precision only on GPU; float32 on CPU, where float16 kernels are
    # slow or unavailable for some operators.
    torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32
    deepseek = DeepSeek(model_path, device, torch_dtype)

    # Step 2: generation parameters and input messages
    gen_kwargs = {
        "max_length": 1024,  # maximum sequence length
        "do_sample": True,  # sample from the distribution instead of greedy decoding
        "top_k": 10,  # keep the k most likely tokens; larger is more random
        "temperature": 0.7,  # output diversity; larger is more creative
        "top_p": 0.8,  # nucleus sampling threshold; larger is more random
        "repetition_penalty": 1.2,  # larger values discourage repetition more
    }

    # Conversation to send
    messages = [
        {"role": "system", "content": "你是一名乐于助人的人工智能助手"},
        {"role": "user", "content": "请简要介绍一下你自己"},
    ]
    response = deepseek.inference(messages, gen_kwargs)  # generator of text fragments
    # Accumulate the fragments and print the text received so far after each one.
    result = ""
    for chunk in response:
        result += chunk
        print(result)
开源大模型本地部署

一、大模型

1. 基本概念

2. 主要特征

3. 应用方向

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

4. 开发流程

5. 关键要点

6. 项目介绍

二、LLM

1. API 调用

1.1 基本流程

1.2 基本特征

1.3 DeepSeek 实操

1.3.1 准备工作

1.3.2 非流式输出

赏析：

1.3.3 流式输出

1.3.4 总结对比

1.3.5 代码封装

三、大模型本地部署

1. 基本介绍

2. 线上体验

3. 本地部署

3.1 modelscope

3.2 huggingface

3.3 Ollama

3.3.1 安装

3.3.2 基本命令

3.3.3 运行模型

3.3.4 适用场景

3.3.5 请求示例

3.4 vLLM

四、DeepSeek-1.5B 本地部署

1. 特点

2. 功能

3. 模型下载

4. 非流式推理

4.1 参考代码

4.2 参数详解

4.3 代码封装

5. 流式推理

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具