>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct">>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
#!/usr/bin/env python
# This script demonstrates how to use DeepSpeed ZeRO in an inference mode when one
# can't fit a model into a single GPU.
#
# 1. Use 1 GPU with CPU offload
# 2. Or use multiple GPUs instead
#
# First you need to install deepspeed: pip install deepspeed
#
# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2
# small GPUs can handle it. Or 1 small GPU and a lot of CPU memory.
#
# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU -
# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to
# process multiple inputs at once.
#
# The provided deepspeed config also activates CPU memory offloading, so chances are that if you
# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a
# model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will
# run faster if you don't want offload to CPU - so disable that section then.
#
# To deploy on 1 gpu:
#
#   deepspeed --num_gpus 1 t0.py
# or:
#   python -m torch.distributed.run --nproc_per_node=1 t0.py
#
# To deploy on 2 gpus:
#
#   deepspeed --num_gpus 2 deepspeed-inference-zero.py
# When specific GPU IDs are given via --include, --num_nodes/--num_gpus are not needed:
#   deepspeed --include=localhost:6,7 deepspeed-inference-zero.py
# or:
#   python -m torch.distributed.run --nproc_per_node=2 t0.py
import os

import torch
import deepspeed
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

# Avoid warnings about parallelism in tokenizers when the launcher forks workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Distributed setup: the launcher (deepspeed / torch.distributed.run) supplies
# LOCAL_RANK and WORLD_SIZE through the environment; default to single-process.
local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()  # fixed: original had the typo `deespeed` (NameError)

# model_name = "bigscience/T0_3B"
model_name = "/workspace/models/Qwen2-7B-Instruct"
config = AutoConfig.from_pretrained(model_name)
# Encoder-decoder models (T5/T0) expose `d_model`; decoder-only models like Qwen2
# expose `hidden_size`.
# model_hidden_size = config.d_model
model_hidden_size = config.hidden_size

# Batch size has to be divisible by world_size, but can be bigger than world_size.
train_batch_size = 1 * world_size
# ds_config notes
#
# - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be
#   faster.
#
# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g.
#   all official t5 models are bf16-pretrained
#
# - set offload_param.device to "none" or completely remove the `offload_param` section if you
#   don't want CPU offload
#
# - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to
#   control which params should remain on gpus - the larger the value the smaller the offload size
#
# For in-depth info on Deepspeed config see
# https://huggingface.co/docs/transformers/main/main_classes/deepspeed
#
# keeping the same format as json for consistency, except it uses lower case for true/false
# fmt: off
ds_config = {
    "fp16": {
        "enabled": False
    },
    "bf16": {
        "enabled": False
    },
    "zero_optimization": {
        # ZeRO stage 3: parameters themselves are partitioned across ranks.
        "stage": 3,
        "offload_param": {
            # Park partitioned parameters in pinned CPU RAM; fetched on demand.
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        # Bucket sizes scaled from the model's hidden size, per the HF DeepSpeed docs.
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size,
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "steps_per_print": 2000,
    "train_batch_size": train_batch_size,
    "train_micro_batch_size_per_gpu": 1,
    "wall_clock_breakdown": False
}
# fmt: on

# The next line instructs transformers to partition the model directly over multiple
# gpus using deepspeed.zero.Init when the model's `from_pretrained` method is called.
# (In the original this line lost its leading `#` — a syntax error — and misspelled
# `deepspeed` as `deeepseed`.)
#
# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)**
#
# otherwise the model will first be loaded normally and only partitioned at forward time which is
# less efficient and when there is little CPU RAM may fail
dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive

# now a model can be loaded.
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# NOTE: AutoModelForSeq2SeqLM only covers encoder-decoder configs (BART, T5, MBart, ...);
# Qwen2 is decoder-only, hence AutoModelForCausalLM below.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# initialise Deepspeed ZeRO and store only the engine object
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()  # inference mode

# Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs
# at once. If you use more GPUs adjust for more.
# And of course if you have just one input to process you then need to pass the same string to
# both gpus. If you use only one GPU, then you will have only rank 0.
rank = torch.distributed.get_rank()
if rank == 0:
    text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
elif rank == 1:
    text_in = "Is this review positive or negative? Review: this is the worst restaurant ever"
else:
    # Ranks beyond the two sample inputs reuse the first prompt instead of crashing
    # with an undefined `text_in` (bug in the original when run on >2 GPUs).
    text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"

tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank)

# synced_gpus=True keeps all ranks stepping together even if one finishes early —
# required under ZeRO-3, where every rank must participate in each forward pass.
with torch.no_grad():
    outputs = ds_engine.module.generate(inputs, max_new_tokens=20, synced_gpus=True)

text_out = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"rank{rank}:\n   in={text_in}\n  out={text_out}")
运行结果:
# deepspeed --num_gpus 2 t0.py
rank0:
in=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy
out=Positive
rank1:
in=Is this review positive or negative? Review: this is the worst restaurant ever
out=negative
from llama_cpp import Llama
import time

llm = Llama(
    # model_path="/workspace/models/Qwen2-7B-Instruct/ggml-model-f16.gguf",
    model_path="/workspace/models/Qwen1.5-72B-Chat/ggml-model-f16.gguf",
    n_gpu_layers=30,    # Offload the first 30 layers to the GPU; the rest run on CPU
    # n_gpu_layers=-1,  # Uncomment to offload every layer (full GPU acceleration)
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)

start = time.time()
output = llm(
    "Q:保持健康的秘诀有哪些?A: ",  # Prompt
    max_tokens=32,      # Generate up to 32 tokens; set to None to generate to end of the context window
    # stream=True,
    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
    echo=True,          # Echo the prompt back in the output
)  # Generate a completion; can also call create_completion
# In the original, print(output) was fused into the trailing comment above and never ran.
print(output)
infer_time = time.time() - start
print("耗时:", infer_time)
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM. cpu_offload_gb=10 lets vLLM spill up to 10 GiB of model weights
# to CPU RAM, effectively enlarging usable "GPU memory" at a bandwidth cost.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs. (In the original, the `for` statement was fused into this
# comment line, which made the indented loop body a syntax error.)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
除了以上介绍的 AI 框架之外,像 fastllm、FlexGen、FlexFlow-Serve 等框架中也提供了 Offload 功能,这里就不一一介绍了。
总结
本文介绍了大模型低显存推理的一些常用方法,同时,重点介绍了 Offload 技术在各个 AI 框架中的应用。整个思想在各个 AI 框架中都大同小异,从技术实现上来说,各个 AI 框架的实现略有不同。从灵活性和可控制性上面来说,我更喜欢 Huggingface Transformers。从推理速度和易用性上来说,我觉得 LLama.cpp 更好。