Windows 平台零基础部署 Qwen1.5 大模型教程

Windows 平台零基础部署 Qwen1.5 大模型教程 | 极客日志

conda --version

conda config --set show_channel_urls yes

channels:
  - defaults
show_channel_urls: true
default_channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2
custom_channels:
  conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud
  deepmodeling: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/

conda create --name llm python=3.11.5

conda info --envs

conda activate llm

git clone https://www.modelscope.cn/qwen/Qwen1.5-0.5B-Chat.git

conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia

conda install conda-forge::transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

# Now you do not need to add "trust_remote_code=True"
model = AutoModelForCausalLM.from_pretrained(
    "Qwen1.5-0.5B-Chat",  # 修改大模型位置
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen1.5-0.5B-Chat") # 修改大模型位置

# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
# 改成中文提问
prompt = "给我简单介绍一下大型语言模型。"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# 打印一下助手回复的内容
print(response)

python qwen.py

conda install conda-forge::accelerate

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

device = "cuda"  # the device to load the model onto

# Now you do not need to add "trust_remote_code=True"
model = AutoModelForCausalLM.from_pretrained(
    "Qwen1.5-0.5B-Chat",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen1.5-0.5B-Chat")

# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
# 改成中文提问
prompt = "给我简单介绍一下大型语言模型。"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
streamer = TextIteratorStreamer(
    tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=512)
thread = Thread(target=model.generate, kwargs=generation_kwargs)

thread.start()
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(generated_text)

conda install fastapi uvicorn

import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from argparse import ArgumentParser

app = FastAPI()

# 支持 CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)


@app.get("/")
async def index():

    return {"message": "Hello World"}


def _get_args():
    parser = ArgumentParser()

    parser.add_argument('--server-port',
                        type=int,
                        default=8000,
                        help='Demo server port.')
    parser.add_argument('--server-name',
                        type=str,
                        default='127.0.0.1',
                        help='Demo server name. Default: 127.0.0.1, which is only visible from the local computer.'
                        ' If you want other computers to access your server, use 0.0.0.0 instead.',
                        )

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = _get_args()

    uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)

python web.py

from contextlib import asynccontextmanager
import torch
import uvicorn
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from argparse import ArgumentParser
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, Field


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

app = FastAPI(lifespan=lifespan)

# 支持 CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)


class ChatMessage(BaseModel):
    role: Literal['user', 'assistant', 'system']
    content: Optional[str]


class DeltaMessage(BaseModel):
    role: Optional[Literal['user', 'assistant', 'system']] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal['stop', 'length']


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal['stop', 'length']]


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal['chat.completion', 'chat.completion.chunk']
    choices: List[Union[ChatCompletionResponseChoice,
                        ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))


@app.get("/")
async def index():

    return {"message": "Hello World"}


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    # 简单的错误校验
    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")

    text = tokenizer.apply_chat_template(
        request.messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Directly use generate() and tokenizer.decode() to get the output.
    # Use `max_new_tokens` to control the maximum output length.
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True)[0]

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")


def _get_args():
    parser = ArgumentParser()

    parser.add_argument(
        '-c',
        '--checkpoint-path',
        type=str,
        default='Qwen1.5-0.5B-Chat',
        help='Checkpoint name or path, default to %(default)r',
    )

    parser.add_argument('--server-port',
                        type=int,
                        default=8000,
                        help='Demo server port.')
    parser.add_argument('--server-name',
                        type=str,
                        default='127.0.0.1',
                        help='Demo server name. Default: 127.0.0.1, which is only visible from the local computer.'
                        ' If you want other computers to access your server, use 0.0.0.0 instead.',
                        )

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = _get_args()

    # Now you do not need to add "trust_remote_code=True"
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)

    uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)

pip install sse_starlette

from contextlib import asynccontextmanager
from threading import Thread
import torch
import uvicorn
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BatchEncoding
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from argparse import ArgumentParser
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

app = FastAPI(lifespan=lifespan)

# 支持 CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)


class ChatMessage(BaseModel):
    role: Literal['user', 'assistant', 'system']
    content: Optional[str]


class DeltaMessage(BaseModel):
    role: Optional[Literal['user', 'assistant', 'system']] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal['stop', 'length']


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal['stop', 'length']]


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal['chat.completion', 'chat.completion.chunk']
    choices: List[Union[ChatCompletionResponseChoice,
                        ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))


@app.get("/")
async def index():

    return {"message": "Hello World"}


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    # 简单的错误校验
    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")

    text = tokenizer.apply_chat_template(
        request.messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    if request.stream:
        generate = predict(model_inputs, request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    # Directly use generate() and tokenizer.decode() to get the output.
    # Use `max_new_tokens` to control the maximum output length.
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True)[0]

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")


async def predict(model_inputs: BatchEncoding,  model_id: str):
    global model, tokenizer

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        model_inputs, streamer=streamer, max_new_tokens=512)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[
                                   choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    thread.start()
    for new_text in streamer:
        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_text),
            finish_reason=None
        )

        chunk = ChatCompletionResponse(model=model_id, choices=[
                                       choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[
                                   choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def _get_args():
    parser = ArgumentParser()

    parser.add_argument(
        '-c',
        '--checkpoint-path',
        type=str,
        default='Qwen1.5-0.5B-Chat',
        help='Checkpoint name or path, default to %(default)r',
    )

    parser.add_argument('--server-port',
                        type=int,
                        default=8000,
                        help='Demo server port.')
    parser.add_argument('--server-name',
                        type=str,
                        default='127.0.0.1',
                        help='Demo server name. Default: 127.0.0.1, which is only visible from the local computer.'
                        ' If you want other computers to access your server, use 0.0.0.0 instead.',
                        )

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = _get_args()

    # Now you do not need to add "trust_remote_code=True"
    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint_path)

    uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)

Windows 平台零基础部署 Qwen1.5 大模型教程

GPU 升级到合适的驱动

安装 Anaconda

删除原有 Python

下载 Anaconda3

安装 Anaconda3

配置环境变量

检查安装成功与否

更换 conda 源

创建虚拟环境

部署大模型

下载大模型

安装依赖

安装 PyTorch

安装 transformers

配置 VSCode

尝试跑 demo

部署为 API

快速启动

接入大模型测试

增加流式支持

完善接口

总结

更多推荐文章

相关免费在线工具

Windows 平台零基础部署 Qwen1.5 大模型教程

GPU 升级到合适的驱动

安装 Anaconda

删除原有 Python

下载 Anaconda3

安装 Anaconda3

配置环境变量

检查安装成功与否

更换 conda 源

创建虚拟环境

部署大模型

下载大模型

安装依赖

安装 PyTorch

安装 transformers

配置 VSCode

尝试跑 demo

部署为 API

快速启动

接入大模型测试

增加流式支持

完善接口

总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具