>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct">>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
#!/usr/bin/env python
# This script demonstrates how to use DeepSpeed ZeRO in an inference mode when one
# can't fit a model into a single GPU.
#
# 1. Use 1 GPU with CPU offload
# 2. Or use multiple GPUs instead
#
# First you need to install deepspeed: pip install deepspeed
#
# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2
# small GPUs can handle it. Or 1 small GPU and a lot of CPU memory.
#
# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU -
# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to
# process multiple inputs at once.
#
# The provided deepspeed config also activates CPU memory offloading, so chances are that if you
# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a
# model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will
# run faster if you don't want offload to CPU - so disable that section then.
#
# To deploy on 1 gpu:
#
#   deepspeed --num_gpus 1 t0.py
# or:
#   python -m torch.distributed.run --nproc_per_node=1 t0.py
#
# To deploy on 2 gpus:
#
#   deepspeed --num_gpus 2 deepspeed-inference-zero.py
# When specific GPU IDs are given via --include, --num_nodes/--num_gpus are not needed:
#   deepspeed --include=localhost:6,7 deepspeed-inference-zero.py
# or:
#   python -m torch.distributed.run --nproc_per_node=2 t0.py
import os

import torch
import deepspeed
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

# Avoid warnings about parallelism in tokenizers when the launcher forks workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Distributed setup: the launcher (deepspeed / torch.distributed.run) supplies
# LOCAL_RANK and WORLD_SIZE through the environment; default to single-process.
local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()  # fixed: original had the typo `deespeed` (NameError)

# model_name = "bigscience/T0_3B"
model_name = "/workspace/models/Qwen2-7B-Instruct"
config = AutoConfig.from_pretrained(model_name)
# Encoder-decoder models (T5/T0) expose `d_model`; decoder-only models like Qwen2
# expose `hidden_size`.
# model_hidden_size = config.d_model
model_hidden_size = config.hidden_size

# Batch size has to be divisible by world_size, but can be bigger than world_size.
train_batch_size = 1 * world_size
# ds_config notes
#
# - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be
#   faster.
#
# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g.
#   all official t5 models are bf16-pretrained
#
# - set offload_param.device to "none" or completely remove the `offload_param` section if you
#   don't want CPU offload
#
# - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to
#   control which params should remain on gpus - the larger the value the smaller the offload size
#
# For in-depth info on Deepspeed config see
# https://huggingface.co/docs/transformers/main/main_classes/deepspeed
#
# keeping the same format as json for consistency, except it uses lower case for true/false
# fmt: off
ds_config = {
    "fp16": {
        "enabled": False
    },
    "bf16": {
        "enabled": False
    },
    "zero_optimization": {
        # ZeRO stage 3: parameters themselves are partitioned across ranks.
        "stage": 3,
        "offload_param": {
            # Park partitioned parameters in pinned CPU RAM; fetched on demand.
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        # Bucket sizes scaled from the model's hidden size, per the HF DeepSpeed docs.
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size,
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "steps_per_print": 2000,
    "train_batch_size": train_batch_size,
    "train_micro_batch_size_per_gpu": 1,
    "wall_clock_breakdown": False
}
# fmt: on

# The next line instructs transformers to partition the model directly over multiple
# gpus using deepspeed.zero.Init when the model's `from_pretrained` method is called.
# (In the original this line lost its leading `#` — a syntax error — and misspelled
# `deepspeed` as `deeepseed`.)
#
# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)**
#
# otherwise the model will first be loaded normally and only partitioned at forward time which is
# less efficient and when there is little CPU RAM may fail
dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive

# now a model can be loaded.
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# NOTE: AutoModelForSeq2SeqLM only covers encoder-decoder configs (BART, T5, MBart, ...);
# Qwen2 is decoder-only, hence AutoModelForCausalLM below.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# initialise Deepspeed ZeRO and store only the engine object
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()  # inference mode

# Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs
# at once. If you use more GPUs adjust for more.
# And of course if you have just one input to process you then need to pass the same string to
# both gpus. If you use only one GPU, then you will have only rank 0.
rank = torch.distributed.get_rank()
if rank == 0:
    text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
elif rank == 1:
    text_in = "Is this review positive or negative? Review: this is the worst restaurant ever"
else:
    # Ranks beyond the two sample inputs reuse the first prompt instead of crashing
    # with an undefined `text_in` (bug in the original when run on >2 GPUs).
    text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"

tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank)

# synced_gpus=True keeps all ranks stepping together even if one finishes early —
# required under ZeRO-3, where every rank must participate in each forward pass.
with torch.no_grad():
    outputs = ds_engine.module.generate(inputs, max_new_tokens=20, synced_gpus=True)

text_out = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"rank{rank}:\n   in={text_in}\n  out={text_out}")
运行结果:
# deepspeed --num_gpus 2 t0.py
rank0:
in=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy
out=Positive
rank1:
in=Is this review positive or negative? Review: this is the worst restaurant ever
out=negative
from llama_cpp import Llama
import time

llm = Llama(
    # model_path="/workspace/models/Qwen2-7B-Instruct/ggml-model-f16.gguf",
    model_path="/workspace/models/Qwen1.5-72B-Chat/ggml-model-f16.gguf",
    n_gpu_layers=30,    # Offload the first 30 layers to the GPU; the rest run on CPU
    # n_gpu_layers=-1,  # Uncomment to offload every layer (full GPU acceleration)
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)

start = time.time()
output = llm(
    "Q:保持健康的秘诀有哪些?A: ",  # Prompt
    max_tokens=32,      # Generate up to 32 tokens; set to None to generate to end of the context window
    # stream=True,
    stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
    echo=True,          # Echo the prompt back in the output
)  # Generate a completion; can also call create_completion
# In the original, print(output) was fused into the trailing comment above and never ran.
print(output)
infer_time = time.time() - start
print("耗时:", infer_time)
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM. cpu_offload_gb=10 lets vLLM spill up to 10 GiB of model weights
# to CPU RAM, effectively enlarging usable "GPU memory" at a bandwidth cost.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs. (In the original, the `for` statement was fused into this
# comment line, which made the indented loop body a syntax error.)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
除了以上介绍的 AI 框架之外,像 fastllm、FlexGen、FlexFlow-Serve 等框架中也提供了 Offload 功能,这里就不一一介绍了。
总结
本文介绍了大模型低显存推理的一些常用方法,同时,重点介绍了 Offload 技术在各个 AI 框架中的应用。整个思想在各个 AI 框架中都大同小异,从技术实现上来说,各个 AI 框架的实现略有不同。从灵活性和可控制性上面来说,我更喜欢 Huggingface Transformers。从推理速度和易用性上来说,我觉得 LLama.cpp 更好。